/*
 * Copyright (c) 2018-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#ifdef __aarch64__

#include <algorithm>

#include "arm_gemm.hpp"

#include "../../asmlib.hpp"
#include "../../utils.hpp"

namespace arm_gemm {

void a64_hybrid_fp32_mla_16x4_x1(const float *A, int lda, const float *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool accumulate) {
    const int K_stride = K;
    const long loops_count = ((K + 4) / 8) - 1;
    K -= loops_count * 8;
    const long regs_count = (K / 4) - 1;
    K -= (regs_count + 1) * 4;
    const long blocks_count = K / 1;
    float nullbias[16];
    if (!accumulate && !bias) {
        memset(nullbias, 0, (16 * sizeof(float)));
    }
    float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
    float maxval =   static_cast<float>(std::numeric_limits<float>::infinity());
    const float * const minptr = &minval;
    const float * const maxptr = &maxval;

    switch(act.type)
    {
        default:
        case Activation::Type::None:
            break;
        case Activation::Type::BoundedReLU:
            maxval = static_cast<float>(act.param1);
            /* fall through */
        case Activation::Type::ReLU:
            minval = 0.0f;
            break;
    }

    int rows_to_compute;

    for (int y=0; y<M; y+=rows_to_compute) {
        const float * const a_ptr0_base = A + (y * lda);
        const unsigned long ldab = lda * sizeof(float);

        float *c_ptr0 = C + (y * ldc);

        rows_to_compute = M-y;
        if (rows_to_compute > 4) {
            if (rows_to_compute % 4) {
                rows_to_compute = 4 - 1;
            } else {
                rows_to_compute = 4;
            }
        }

        for (int x0=0; x0<N; x0+=16ul) {
            const long width = std::min((unsigned long)N-x0, 16ul);
            long loops = loops_count;
            long regs = regs_count;
            long blocks = blocks_count;
            const float *a_ptr0 = a_ptr0_base;
            const float *b_ptr0 = B + (K_stride * x0);
            const bool use_result_buffer = (width < 16);
            float result_buffer[64];
            const unsigned long ldcb = (use_result_buffer ? 16 : ldc) * sizeof(float);
            float *c_ptr_real = c_ptr0;
            if (use_result_buffer && accumulate) {
                for(int cy=0; cy<std::min(M-y, 4); cy++) {
                    for(unsigned int cx=0; cx<width; cx++) {
                        result_buffer[cy * 16 + cx] = c_ptr_real[cy * ldc + cx];
                    }
                }
            }
            if (use_result_buffer) {
                c_ptr0 = result_buffer;
            }
            const float *biasptr = bias ? bias+x0 : nullbias;

            switch(rows_to_compute) {
                case 1:
                    __asm __volatile (
                        "cbnz %[accumulate], 1f\n"
                        "ldr q16, [%[biasptr]]\n"
                        "ldr q17, [%[biasptr], #0x10]\n"
                        "ldr q18, [%[biasptr], #0x20]\n"
                        "ldr q19, [%[biasptr], #0x30]\n"
                        "ldr q0, [%[a_ptr0]]\n"
                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
                        "ldr q8, [%[b_ptr0]]\n"
                        "ldr q9, [%[b_ptr0], #0x10]\n"
                        "ldr q10, [%[b_ptr0], #0x20]\n"
                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
                        "cbz %[loops], 2f\n"
                        "b 3f\n"
                        "1:\n"
                        "ldr q16, [%[c_ptr0]]\n"
                        "ldr q17, [%[c_ptr0], #0x10]\n"
                        "ldr q18, [%[c_ptr0], #0x20]\n"
                        "ldr q19, [%[c_ptr0], #0x30]\n"
                        "ldr q0, [%[a_ptr0]]\n"
                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
                        "ldr q8, [%[b_ptr0]]\n"
                        "ldr q9, [%[b_ptr0], #0x10]\n"
                        "ldr q10, [%[b_ptr0], #0x20]\n"
                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
                        "cbz %[loops], 2f\n"
                        "3:\n"
                        "fmla v16.4s, v8.4s, v0.s[0]\n"
                        "ldr q11, [%[b_ptr0], #-0x10]\n"
                        "fmla v17.4s, v9.4s, v0.s[0]\n"
                        "ldr q4, [%[a_ptr0]]\n"
                        "fmla v18.4s, v10.4s, v0.s[0]\n"
                        "ldr q8, [%[b_ptr0]]\n"
                        "ldr q9, [%[b_ptr0], #0x10]\n"
                        "subs %[loops], %[loops], #0x1\n"
                        "fmla v19.4s, v11.4s, v0.s[0]\n"
                        "ldr q10, [%[b_ptr0], #0x20]\n"
                        "fmla v16.4s, v8.4s, v0.s[1]\n"
                        "ldr q11, [%[b_ptr0], #0x30]\n"
                        "fmla v17.4s, v9.4s, v0.s[1]\n"
                        "ldr q8, [%[b_ptr0], #0x40]\n"
                        "ldr q9, [%[b_ptr0], #0x50]\n"
                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
                        "fmla v18.4s, v10.4s, v0.s[1]\n"
                        "ldr q10, [%[b_ptr0], #0x60]\n"
                        "fmla v19.4s, v11.4s, v0.s[1]\n"
                        "ldr q11, [%[b_ptr0], #0x70]\n"
                        "fmla v16.4s, v8.4s, v0.s[2]\n"
                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
                        "fmla v17.4s, v9.4s, v0.s[2]\n"
                        "ldr q8, [%[b_ptr0], #-0x80]\n"
                        "fmla v18.4s, v10.4s, v0.s[2]\n"
                        "ldr q9, [%[b_ptr0], #-0x70]\n"
                        "fmla v19.4s, v11.4s, v0.s[2]\n"
                        "ldr q10, [%[b_ptr0], #-0x60]\n"
                        "ldr q11, [%[b_ptr0], #-0x50]\n"
                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
                        "fmla v16.4s, v8.4s, v0.s[3]\n"
                        "ldr q8, [%[b_ptr0], #-0x40]\n"
                        "fmla v17.4s, v9.4s, v0.s[3]\n"
                        "ldr q9, [%[b_ptr0], #-0x30]\n"
                        "fmla v18.4s, v10.4s, v0.s[3]\n"
                        "ldr q10, [%[b_ptr0], #-0x20]\n"
                        "fmla v19.4s, v11.4s, v0.s[3]\n"
                        "ldr q11, [%[b_ptr0], #-0x10]\n"
                        "fmla v16.4s, v8.4s, v4.s[0]\n"
                        "ldr q0, [%[a_ptr0], #-0x10]\n"
                        "fmla v17.4s, v9.4s, v4.s[0]\n"
                        "ldr q8, [%[b_ptr0]]\n"
                        "fmla v18.4s, v10.4s, v4.s[0]\n"
                        "ldr q9, [%[b_ptr0], #0x10]\n"
                        "fmla v19.4s, v11.4s, v4.s[0]\n"
                        "ldr q10, [%[b_ptr0], #0x20]\n"
                        "ldr q11, [%[b_ptr0], #0x30]\n"
                        "fmla v16.4s, v8.4s, v4.s[1]\n"
                        "ldr q8, [%[b_ptr0], #0x40]\n"
                        "fmla v17.4s, v9.4s, v4.s[1]\n"
                        "ldr q9, [%[b_ptr0], #0x50]\n"
                        "fmla v18.4s, v10.4s, v4.s[1]\n"
                        "ldr q10, [%[b_ptr0], #0x60]\n"
                        "fmla v19.4s, v11.4s, v4.s[1]\n"
                        "ldr q11, [%[b_ptr0], #0x70]\n"
                        "fmla v16.4s, v8.4s, v4.s[2]\n"
                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
                        "fmla v17.4s, v9.4s, v4.s[2]\n"
                        "ldr q8, [%[b_ptr0], #-0x80]\n"
                        "fmla v18.4s, v10.4s, v4.s[2]\n"
                        "ldr q9, [%[b_ptr0], #-0x70]\n"
                        "fmla v19.4s, v11.4s, v4.s[2]\n"
                        "ldr q10, [%[b_ptr0], #-0x60]\n"
                        "ldr q11, [%[b_ptr0], #-0x50]\n"
                        "fmla v16.4s, v8.4s, v4.s[3]\n"
                        "ldr q8, [%[b_ptr0], #-0x40]\n"
                        "fmla v17.4s, v9.4s, v4.s[3]\n"
                        "ldr q9, [%[b_ptr0], #-0x30]\n"
                        "fmla v18.4s, v10.4s, v4.s[3]\n"
                        "ldr q10, [%[b_ptr0], #-0x20]\n"
                        "fmla v19.4s, v11.4s, v4.s[3]\n"
                        "b.ne 3b\n"
                        "2:\n"
                        "ldr q11, [%[b_ptr0], #-0x10]\n"
                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
                        "cbz %[regs], 4f\n"
                        "fmla v16.4s, v8.4s, v0.s[0]\n"
                        "ldr q4, [%[a_ptr0]]\n"
                        "fmla v17.4s, v9.4s, v0.s[0]\n"
                        "ldr q8, [%[b_ptr0]]\n"
                        "fmla v18.4s, v10.4s, v0.s[0]\n"
                        "ldr q9, [%[b_ptr0], #0x10]\n"
                        "fmla v19.4s, v11.4s, v0.s[0]\n"
                        "ldr q10, [%[b_ptr0], #0x20]\n"
                        "ldr q11, [%[b_ptr0], #0x30]\n"
                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
                        "fmla v16.4s, v8.4s, v0.s[1]\n"
                        "ldr q8, [%[b_ptr0], #0x40]\n"
                        "fmla v17.4s, v9.4s, v0.s[1]\n"
                        "ldr q9, [%[b_ptr0], #0x50]\n"
                        "fmla v18.4s, v10.4s, v0.s[1]\n"
                        "ldr q10, [%[b_ptr0], #0x60]\n"
                        "fmla v19.4s, v11.4s, v0.s[1]\n"
                        "ldr q11, [%[b_ptr0], #0x70]\n"
                        "fmla v16.4s, v8.4s, v0.s[2]\n"
                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
                        "fmla v17.4s, v9.4s, v0.s[2]\n"
                        "ldr q8, [%[b_ptr0], #-0x80]\n"
                        "fmla v18.4s, v10.4s, v0.s[2]\n"
                        "ldr q9, [%[b_ptr0], #-0x70]\n"
                        "fmla v19.4s, v11.4s, v0.s[2]\n"
                        "ldr q10, [%[b_ptr0], #-0x60]\n"
                        "ldr q11, [%[b_ptr0], #-0x50]\n"
                        "fmla v16.4s, v8.4s, v0.s[3]\n"
                        "ldr q8, [%[b_ptr0], #-0x40]\n"
                        "fmla v17.4s, v9.4s, v0.s[3]\n"
                        "ldr q9, [%[b_ptr0], #-0x30]\n"
                        "fmla v18.4s, v10.4s, v0.s[3]\n"
                        "ldr q10, [%[b_ptr0], #-0x20]\n"
                        "fmla v19.4s, v11.4s, v0.s[3]\n"
                        "ldr q11, [%[b_ptr0], #-0x10]\n"
                        "fmla v16.4s, v8.4s, v4.s[0]\n"
                        "ldr q8, [%[b_ptr0]]\n"
                        "fmla v17.4s, v9.4s, v4.s[0]\n"
                        "ldr q9, [%[b_ptr0], #0x10]\n"
                        "fmla v18.4s, v10.4s, v4.s[0]\n"
                        "ldr q10, [%[b_ptr0], #0x20]\n"
                        "fmla v19.4s, v11.4s, v4.s[0]\n"
                        "ldr q11, [%[b_ptr0], #0x30]\n"
                        "fmla v16.4s, v8.4s, v4.s[1]\n"
                        "ldr q8, [%[b_ptr0], #0x40]\n"
                        "fmla v17.4s, v9.4s, v4.s[1]\n"
                        "ldr q9, [%[b_ptr0], #0x50]\n"
                        "fmla v18.4s, v10.4s, v4.s[1]\n"
                        "ldr q10, [%[b_ptr0], #0x60]\n"
                        "fmla v19.4s, v11.4s, v4.s[1]\n"
                        "ldr q11, [%[b_ptr0], #0x70]\n"
                        "fmla v16.4s, v8.4s, v4.s[2]\n"
                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
                        "fmla v17.4s, v9.4s, v4.s[2]\n"
                        "ldr q8, [%[b_ptr0], #-0x80]\n"
                        "fmla v18.4s, v10.4s, v4.s[2]\n"
                        "ldr q9, [%[b_ptr0], #-0x70]\n"
                        "fmla v19.4s, v11.4s, v4.s[2]\n"
                        "ldr q10, [%[b_ptr0], #-0x60]\n"
                        "ldr q11, [%[b_ptr0], #-0x50]\n"
                        "add %[b_ptr0], %[b_ptr0], #-0x40\n"
                        "fmla v16.4s, v8.4s, v4.s[3]\n"
                        "fmla v17.4s, v9.4s, v4.s[3]\n"
                        "fmla v18.4s, v10.4s, v4.s[3]\n"
                        "fmla v19.4s, v11.4s, v4.s[3]\n"
                        "b 5f\n"
                        "4:\n"
                        "fmla v16.4s, v8.4s, v0.s[0]\n"
                        "ldr q8, [%[b_ptr0]]\n"
                        "fmla v17.4s, v9.4s, v0.s[0]\n"
                        "ldr q9, [%[b_ptr0], #0x10]\n"
                        "fmla v18.4s, v10.4s, v0.s[0]\n"
                        "ldr q10, [%[b_ptr0], #0x20]\n"
                        "fmla v19.4s, v11.4s, v0.s[0]\n"
                        "ldr q11, [%[b_ptr0], #0x30]\n"
                        "fmla v16.4s, v8.4s, v0.s[1]\n"
                        "ldr q8, [%[b_ptr0], #0x40]\n"
                        "fmla v17.4s, v9.4s, v0.s[1]\n"
                        "ldr q9, [%[b_ptr0], #0x50]\n"
                        "fmla v18.4s, v10.4s, v0.s[1]\n"
                        "ldr q10, [%[b_ptr0], #0x60]\n"
                        "fmla v19.4s, v11.4s, v0.s[1]\n"
                        "ldr q11, [%[b_ptr0], #0x70]\n"
                        "fmla v16.4s, v8.4s, v0.s[2]\n"
                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
                        "fmla v17.4s, v9.4s, v0.s[2]\n"
                        "ldr q8, [%[b_ptr0], #-0x80]\n"
                        "fmla v18.4s, v10.4s, v0.s[2]\n"
                        "ldr q9, [%[b_ptr0], #-0x70]\n"
                        "fmla v19.4s, v11.4s, v0.s[2]\n"
                        "ldr q10, [%[b_ptr0], #-0x60]\n"
                        "ldr q11, [%[b_ptr0], #-0x50]\n"
                        "add %[b_ptr0], %[b_ptr0], #-0x40\n"
                        "fmla v16.4s, v8.4s, v0.s[3]\n"
                        "fmla v17.4s, v9.4s, v0.s[3]\n"
                        "fmla v18.4s, v10.4s, v0.s[3]\n"
                        "fmla v19.4s, v11.4s, v0.s[3]\n"
                        "5:\n"
                        "cbz %[blocks], 6f\n"
                        "7:\n"
                        "ldr q8, [%[b_ptr0]]\n"
                        "subs %[blocks], %[blocks], #0x1\n"
                        "ldr q9, [%[b_ptr0], #0x10]\n"
                        "ldr s0, [%[a_ptr0]]\n"
                        "ldr q10, [%[b_ptr0], #0x20]\n"
                        "add %[a_ptr0], %[a_ptr0], #0x4\n"
                        "ldr q11, [%[b_ptr0], #0x30]\n"
                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
                        "fmla v16.4s, v8.4s, v0.s[0]\n"
                        "fmla v17.4s, v9.4s, v0.s[0]\n"
                        "fmla v18.4s, v10.4s, v0.s[0]\n"
                        "fmla v19.4s, v11.4s, v0.s[0]\n"
                        "b.ne 7b\n"
                        "6:\n"
                        "ld1r {v14.4s}, [%[minptr]]\n"
                        "ld1r {v15.4s}, [%[maxptr]]\n"
                        "fmax v16.4s, v16.4s, v14.4s\n"
                        "fmax v17.4s, v17.4s, v14.4s\n"
                        "fmax v18.4s, v18.4s, v14.4s\n"
                        "fmax v19.4s, v19.4s, v14.4s\n"
                        "fmin v16.4s, v16.4s, v15.4s\n"
                        "fmin v17.4s, v17.4s, v15.4s\n"
                        "fmin v18.4s, v18.4s, v15.4s\n"
                        "fmin v19.4s, v19.4s, v15.4s\n"
                        "str q16, [%[c_ptr0]]\n"
                        "str q17, [%[c_ptr0], #0x10]\n"
                        "str q18, [%[c_ptr0], #0x20]\n"
                        "str q19, [%[c_ptr0], #0x30]\n"
                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
                    );
                    break;
                case 2:
                    __asm __volatile (
                        "a_ptr1 .req X0\n"
                        "c_ptr1 .req X1\n"
                        "add a_ptr1, %[a_ptr0], %[lda]\n"
                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
                        "cbnz %[accumulate], 1f\n"
                        "ldr q16, [%[biasptr]]\n"
                        "ldr q17, [%[biasptr], #0x10]\n"
                        "ldr q18, [%[biasptr], #0x20]\n"
                        "ldr q19, [%[biasptr], #0x30]\n"
                        "mov v20.16b, v16.16b\n"
                        "ldr q0, [%[a_ptr0]]\n"
                        "mov v21.16b, v17.16b\n"
                        "ldr q1, [a_ptr1]\n"
                        "mov v22.16b, v18.16b\n"
                        "ldr q8, [%[b_ptr0]]\n"
                        "mov v23.16b, v19.16b\n"
                        "ldr q9, [%[b_ptr0], #0x10]\n"
                        "ldr q10, [%[b_ptr0], #0x20]\n"
                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
                        "add a_ptr1, a_ptr1, #0x10\n"
                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
                        "cbz %[loops], 2f\n"
                        "b 3f\n"
                        "1:\n"
                        "ldr q16, [%[c_ptr0]]\n"
                        "ldr q17, [%[c_ptr0], #0x10]\n"
                        "ldr q18, [%[c_ptr0], #0x20]\n"
                        "ldr q19, [%[c_ptr0], #0x30]\n"
                        "ldr q20, [c_ptr1]\n"
                        "ldr q21, [c_ptr1, #0x10]\n"
                        "ldr q22, [c_ptr1, #0x20]\n"
                        "ldr q23, [c_ptr1, #0x30]\n"
                        "ldr q0, [%[a_ptr0]]\n"
                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
                        "ldr q1, [a_ptr1]\n"
                        "add a_ptr1, a_ptr1, #0x10\n"
                        "ldr q8, [%[b_ptr0]]\n"
                        "ldr q9, [%[b_ptr0], #0x10]\n"
                        "ldr q10, [%[b_ptr0], #0x20]\n"
                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
                        "cbz %[loops], 2f\n"
                        "3:\n"
                        "fmla v16.4s, v8.4s, v0.s[0]\n"
                        "ldr q11, [%[b_ptr0], #-0x10]\n"
                        "fmla v20.4s, v8.4s, v1.s[0]\n"
                        "ldr q4, [%[a_ptr0]]\n"
                        "fmla v17.4s, v9.4s, v0.s[0]\n"
                        "ldr q5, [a_ptr1]\n"
                        "fmla v21.4s, v9.4s, v1.s[0]\n"
                        "ldr q8, [%[b_ptr0]]\n"
                        "fmla v18.4s, v10.4s, v0.s[0]\n"
                        "ldr q9, [%[b_ptr0], #0x10]\n"
                        "fmla v22.4s, v10.4s, v1.s[0]\n"
                        "ldr q10, [%[b_ptr0], #0x20]\n"
                        "fmla v19.4s, v11.4s, v0.s[0]\n"
                        "subs %[loops], %[loops], #0x1\n"
                        "fmla v23.4s, v11.4s, v1.s[0]\n"
                        "ldr q11, [%[b_ptr0], #0x30]\n"
                        "fmla v16.4s, v8.4s, v0.s[1]\n"
                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
                        "fmla v20.4s, v8.4s, v1.s[1]\n"
                        "ldr q8, [%[b_ptr0], #0x40]\n"
                        "fmla v17.4s, v9.4s, v0.s[1]\n"
                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
                        "fmla v21.4s, v9.4s, v1.s[1]\n"
                        "ldr q9, [%[b_ptr0], #0x50]\n"
                        "fmla v18.4s, v10.4s, v0.s[1]\n"
                        "add a_ptr1, a_ptr1, #0x20\n"
                        "fmla v22.4s, v10.4s, v1.s[1]\n"
                        "ldr q10, [%[b_ptr0], #0x60]\n"
                        "fmla v19.4s, v11.4s, v0.s[1]\n"
                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
                        "fmla v23.4s, v11.4s, v1.s[1]\n"
                        "ldr q11, [%[b_ptr0], #0x70]\n"
                        "fmla v16.4s, v8.4s, v0.s[2]\n"
                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
                        "fmla v20.4s, v8.4s, v1.s[2]\n"
                        "ldr q8, [%[b_ptr0], #-0x80]\n"
                        "fmla v17.4s, v9.4s, v0.s[2]\n"
                        "fmla v21.4s, v9.4s, v1.s[2]\n"
                        "ldr q9, [%[b_ptr0], #-0x70]\n"
                        "fmla v18.4s, v10.4s, v0.s[2]\n"
                        "fmla v22.4s, v10.4s, v1.s[2]\n"
                        "ldr q10, [%[b_ptr0], #-0x60]\n"
                        "fmla v19.4s, v11.4s, v0.s[2]\n"
                        "fmla v23.4s, v11.4s, v1.s[2]\n"
                        "ldr q11, [%[b_ptr0], #-0x50]\n"
                        "fmla v16.4s, v8.4s, v0.s[3]\n"
                        "fmla v20.4s, v8.4s, v1.s[3]\n"
                        "ldr q8, [%[b_ptr0], #-0x40]\n"
                        "fmla v17.4s, v9.4s, v0.s[3]\n"
                        "fmla v21.4s, v9.4s, v1.s[3]\n"
                        "ldr q9, [%[b_ptr0], #-0x30]\n"
                        "fmla v18.4s, v10.4s, v0.s[3]\n"
                        "fmla v22.4s, v10.4s, v1.s[3]\n"
                        "ldr q10, [%[b_ptr0], #-0x20]\n"
                        "fmla v19.4s, v11.4s, v0.s[3]\n"
                        "ldr q0, [%[a_ptr0], #-0x10]\n"
                        "fmla v23.4s, v11.4s, v1.s[3]\n"
                        "ldr q11, [%[b_ptr0], #-0x10]\n"
                        "fmla v16.4s, v8.4s, v4.s[0]\n"
                        "ldr q1, [a_ptr1, #-0x10]\n"
                        "fmla v20.4s, v8.4s, v5.s[0]\n"
                        "ldr q8, [%[b_ptr0]]\n"
                        "fmla v17.4s, v9.4s, v4.s[0]\n"
                        "fmla v21.4s, v9.4s, v5.s[0]\n"
                        "ldr q9, [%[b_ptr0], #0x10]\n"
                        "fmla v18.4s, v10.4s, v4.s[0]\n"
                        "fmla v22.4s, v10.4s, v5.s[0]\n"
                        "ldr q10, [%[b_ptr0], #0x20]\n"
                        "fmla v19.4s, v11.4s, v4.s[0]\n"
                        "fmla v23.4s, v11.4s, v5.s[0]\n"
                        "ldr q11, [%[b_ptr0], #0x30]\n"
                        "fmla v16.4s, v8.4s, v4.s[1]\n"
                        "fmla v20.4s, v8.4s, v5.s[1]\n"
                        "ldr q8, [%[b_ptr0], #0x40]\n"
                        "fmla v17.4s, v9.4s, v4.s[1]\n"
                        "fmla v21.4s, v9.4s, v5.s[1]\n"
                        "ldr q9, [%[b_ptr0], #0x50]\n"
                        "fmla v18.4s, v10.4s, v4.s[1]\n"
                        "fmla v22.4s, v10.4s, v5.s[1]\n"
                        "ldr q10, [%[b_ptr0], #0x60]\n"
                        "fmla v19.4s, v11.4s, v4.s[1]\n"
                        "fmla v23.4s, v11.4s, v5.s[1]\n"
                        "ldr q11, [%[b_ptr0], #0x70]\n"
                        "fmla v16.4s, v8.4s, v4.s[2]\n"
                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
                        "fmla v20.4s, v8.4s, v5.s[2]\n"
                        "ldr q8, [%[b_ptr0], #-0x80]\n"
                        "fmla v17.4s, v9.4s, v4.s[2]\n"
                        "fmla v21.4s, v9.4s, v5.s[2]\n"
                        "ldr q9, [%[b_ptr0], #-0x70]\n"
                        "fmla v18.4s, v10.4s, v4.s[2]\n"
                        "fmla v22.4s, v10.4s, v5.s[2]\n"
                        "ldr q10, [%[b_ptr0], #-0x60]\n"
                        "fmla v19.4s, v11.4s, v4.s[2]\n"
                        "fmla v23.4s, v11.4s, v5.s[2]\n"
                        "ldr q11, [%[b_ptr0], #-0x50]\n"
                        "fmla v16.4s, v8.4s, v4.s[3]\n"
                        "fmla v20.4s, v8.4s, v5.s[3]\n"
                        "ldr q8, [%[b_ptr0], #-0x40]\n"
                        "fmla v17.4s, v9.4s, v4.s[3]\n"
                        "fmla v21.4s, v9.4s, v5.s[3]\n"
                        "ldr q9, [%[b_ptr0], #-0x30]\n"
                        "fmla v18.4s, v10.4s, v4.s[3]\n"
                        "fmla v22.4s, v10.4s, v5.s[3]\n"
                        "ldr q10, [%[b_ptr0], #-0x20]\n"
                        "fmla v19.4s, v11.4s, v4.s[3]\n"
                        "fmla v23.4s, v11.4s, v5.s[3]\n"
                        "b.ne 3b\n"
                        "2:\n"
                        "ldr q11, [%[b_ptr0], #-0x10]\n"
                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
                        "prfm PSTL1KEEP, [c_ptr1]\n"
                        "cbz %[regs], 4f\n"
                        "fmla v16.4s, v8.4s, v0.s[0]\n"
                        "ldr q4, [%[a_ptr0]]\n"
                        "fmla v20.4s, v8.4s, v1.s[0]\n"
                        "ldr q5, [a_ptr1]\n"
                        "fmla v17.4s, v9.4s, v0.s[0]\n"
                        "ldr q8, [%[b_ptr0]]\n"
                        "fmla v21.4s, v9.4s, v1.s[0]\n"
                        "ldr q9, [%[b_ptr0], #0x10]\n"
                        "fmla v18.4s, v10.4s, v0.s[0]\n"
                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
                        "fmla v22.4s, v10.4s, v1.s[0]\n"
                        "ldr q10, [%[b_ptr0], #0x20]\n"
                        "fmla v19.4s, v11.4s, v0.s[0]\n"
                        "add a_ptr1, a_ptr1, #0x10\n"
                        "fmla v23.4s, v11.4s, v1.s[0]\n"
                        "ldr q11, [%[b_ptr0], #0x30]\n"
                        "fmla v16.4s, v8.4s, v0.s[1]\n"
                        "fmla v20.4s, v8.4s, v1.s[1]\n"
                        "ldr q8, [%[b_ptr0], #0x40]\n"
                        "fmla v17.4s, v9.4s, v0.s[1]\n"
                        "fmla v21.4s, v9.4s, v1.s[1]\n"
                        "ldr q9, [%[b_ptr0], #0x50]\n"
                        "fmla v18.4s, v10.4s, v0.s[1]\n"
                        "fmla v22.4s, v10.4s, v1.s[1]\n"
                        "ldr q10, [%[b_ptr0], #0x60]\n"
                        "fmla v19.4s, v11.4s, v0.s[1]\n"
                        "fmla v23.4s, v11.4s, v1.s[1]\n"
                        "ldr q11, [%[b_ptr0], #0x70]\n"
                        "fmla v16.4s, v8.4s, v0.s[2]\n"
                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
                        "fmla v20.4s, v8.4s, v1.s[2]\n"
                        "ldr q8, [%[b_ptr0], #-0x80]\n"
                        "fmla v17.4s, v9.4s, v0.s[2]\n"
                        "fmla v21.4s, v9.4s, v1.s[2]\n"
                        "ldr q9, [%[b_ptr0], #-0x70]\n"
                        "fmla v18.4s, v10.4s, v0.s[2]\n"
                        "fmla v22.4s, v10.4s, v1.s[2]\n"
                        "ldr q10, [%[b_ptr0], #-0x60]\n"
                        "fmla v19.4s, v11.4s, v0.s[2]\n"
                        "fmla v23.4s, v11.4s, v1.s[2]\n"
                        "ldr q11, [%[b_ptr0], #-0x50]\n"
                        "fmla v16.4s, v8.4s, v0.s[3]\n"
                        "fmla v20.4s, v8.4s, v1.s[3]\n"
                        "ldr q8, [%[b_ptr0], #-0x40]\n"
                        "fmla v17.4s, v9.4s, v0.s[3]\n"
                        "fmla v21.4s, v9.4s, v1.s[3]\n"
                        "ldr q9, [%[b_ptr0], #-0x30]\n"
                        "fmla v18.4s, v10.4s, v0.s[3]\n"
                        "fmla v22.4s, v10.4s, v1.s[3]\n"
                        "ldr q10, [%[b_ptr0], #-0x20]\n"
                        "fmla v19.4s, v11.4s, v0.s[3]\n"
                        "fmla v23.4s, v11.4s, v1.s[3]\n"
                        "ldr q11, [%[b_ptr0], #-0x10]\n"
                        "fmla v16.4s, v8.4s, v4.s[0]\n"
                        "fmla v20.4s, v8.4s, v5.s[0]\n"
                        "ldr q8, [%[b_ptr0]]\n"
                        "fmla v17.4s, v9.4s, v4.s[0]\n"
                        "fmla v21.4s, v9.4s, v5.s[0]\n"
                        "ldr q9, [%[b_ptr0], #0x10]\n"
                        "fmla v18.4s, v10.4s, v4.s[0]\n"
                        "fmla v22.4s, v10.4s, v5.s[0]\n"
                        "ldr q10, [%[b_ptr0], #0x20]\n"
                        "fmla v19.4s, v11.4s, v4.s[0]\n"
                        "fmla v23.4s, v11.4s, v5.s[0]\n"
                        "ldr q11, [%[b_ptr0], #0x30]\n"
                        "fmla v16.4s, v8.4s, v4.s[1]\n"
                        "fmla v20.4s, v8.4s, v5.s[1]\n"
                        "ldr q8, [%[b_ptr0], #0x40]\n"
                        "fmla v17.4s, v9.4s, v4.s[1]\n"
                        "fmla v21.4s, v9.4s, v5.s[1]\n"
                        "ldr q9, [%[b_ptr0], #0x50]\n"
                        "fmla v18.4s, v10.4s, v4.s[1]\n"
                        "fmla v22.4s, v10.4s, v5.s[1]\n"
                        "ldr q10, [%[b_ptr0], #0x60]\n"
                        "fmla v19.4s, v11.4s, v4.s[1]\n"
                        "fmla v23.4s, v11.4s, v5.s[1]\n"
                        "ldr q11, [%[b_ptr0], #0x70]\n"
                        "fmla v16.4s, v8.4s, v4.s[2]\n"
                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
                        "fmla v20.4s, v8.4s, v5.s[2]\n"
                        "ldr q8, [%[b_ptr0], #-0x80]\n"
                        "fmla v17.4s, v9.4s, v4.s[2]\n"
                        "fmla v21.4s, v9.4s, v5.s[2]\n"
                        "ldr q9, [%[b_ptr0], #-0x70]\n"
                        "fmla v18.4s, v10.4s, v4.s[2]\n"
                        "fmla v22.4s, v10.4s, v5.s[2]\n"
                        "ldr q10, [%[b_ptr0], #-0x60]\n"
                        "fmla v19.4s, v11.4s, v4.s[2]\n"
                        "fmla v23.4s, v11.4s, v5.s[2]\n"
                        "ldr q11, [%[b_ptr0], #-0x50]\n"
                        "fmla v16.4s, v8.4s, v4.s[3]\n"
                        "add %[b_ptr0], %[b_ptr0], #-0x40\n"
                        "fmla v20.4s, v8.4s, v5.s[3]\n"
                        "fmla v17.4s, v9.4s, v4.s[3]\n"
                        "fmla v21.4s, v9.4s, v5.s[3]\n"
                        "fmla v18.4s, v10.4s, v4.s[3]\n"
                        "fmla v22.4s, v10.4s, v5.s[3]\n"
                        "fmla v19.4s, v11.4s, v4.s[3]\n"
                        "fmla v23.4s, v11.4s, v5.s[3]\n"
                        "b 5f\n"
                        "4:\n"
                        "fmla v16.4s, v8.4s, v0.s[0]\n"
                        "fmla v20.4s, v8.4s, v1.s[0]\n"
                        "ldr q8, [%[b_ptr0]]\n"
                        "fmla v17.4s, v9.4s, v0.s[0]\n"
                        "fmla v21.4s, v9.4s, v1.s[0]\n"
                        "ldr q9, [%[b_ptr0], #0x10]\n"
                        "fmla v18.4s, v10.4s, v0.s[0]\n"
                        "fmla v22.4s, v10.4s, v1.s[0]\n"
                        "ldr q10, [%[b_ptr0], #0x20]\n"
                        "fmla v19.4s, v11.4s, v0.s[0]\n"
                        "fmla v23.4s, v11.4s, v1.s[0]\n"
                        "ldr q11, [%[b_ptr0], #0x30]\n"
                        "fmla v16.4s, v8.4s, v0.s[1]\n"
                        "fmla v20.4s, v8.4s, v1.s[1]\n"
                        "ldr q8, [%[b_ptr0], #0x40]\n"
                        "fmla v17.4s, v9.4s, v0.s[1]\n"
                        "fmla v21.4s, v9.4s, v1.s[1]\n"
                        "ldr q9, [%[b_ptr0], #0x50]\n"
                        "fmla v18.4s, v10.4s, v0.s[1]\n"
                        "fmla v22.4s, v10.4s, v1.s[1]\n"
                        "ldr q10, [%[b_ptr0], #0x60]\n"
                        "fmla v19.4s, v11.4s, v0.s[1]\n"
                        "fmla v23.4s, v11.4s, v1.s[1]\n"
                        "ldr q11, [%[b_ptr0], #0x70]\n"
                        "fmla v16.4s, v8.4s, v0.s[2]\n"
                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
                        "fmla v20.4s, v8.4s, v1.s[2]\n"
                        "ldr q8, [%[b_ptr0], #-0x80]\n"
                        "fmla v17.4s, v9.4s, v0.s[2]\n"
                        "fmla v21.4s, v9.4s, v1.s[2]\n"
                        "ldr q9, [%[b_ptr0], #-0x70]\n"
                        "fmla v18.4s, v10.4s, v0.s[2]\n"
                        "fmla v22.4s, v10.4s, v1.s[2]\n"
                        "ldr q10, [%[b_ptr0], #-0x60]\n"
                        "fmla v19.4s, v11.4s, v0.s[2]\n"
                        "fmla v23.4s, v11.4s, v1.s[2]\n"
                        "ldr q11, [%[b_ptr0], #-0x50]\n"
                        "fmla v16.4s, v8.4s, v0.s[3]\n"
                        "add %[b_ptr0], %[b_ptr0], #-0x40\n"
                        "fmla v20.4s, v8.4s, v1.s[3]\n"
                        "fmla v17.4s, v9.4s, v0.s[3]\n"
                        "fmla v21.4s, v9.4s, v1.s[3]\n"
                        "fmla v18.4s, v10.4s, v0.s[3]\n"
                        "fmla v22.4s, v10.4s, v1.s[3]\n"
                        "fmla v19.4s, v11.4s, v0.s[3]\n"
                        "fmla v23.4s, v11.4s, v1.s[3]\n"
                        "5:\n"
                        "cbz %[blocks], 6f\n"
                        "7:\n"
                        "ldr q8, [%[b_ptr0]]\n"
                        "subs %[blocks], %[blocks], #0x1\n"
                        "ldr q9, [%[b_ptr0], #0x10]\n"
                        "ldr s0, [%[a_ptr0]]\n"
                        "ldr q10, [%[b_ptr0], #0x20]\n"
                        "add %[a_ptr0], %[a_ptr0], #0x4\n"
                        "ldr q11, [%[b_ptr0], #0x30]\n"
                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
                        "fmla v16.4s, v8.4s, v0.s[0]\n"
                        "ldr s1, [a_ptr1]\n"
                        "fmla v17.4s, v9.4s, v0.s[0]\n"
                        "add a_ptr1, a_ptr1, #0x4\n"
                        "fmla v18.4s, v10.4s, v0.s[0]\n"
                        "fmla v20.4s, v8.4s, v1.s[0]\n"
                        "fmla v21.4s, v9.4s, v1.s[0]\n"
                        "fmla v22.4s, v10.4s, v1.s[0]\n"
                        "fmla v19.4s, v11.4s, v0.s[0]\n"
                        "fmla v23.4s, v11.4s, v1.s[0]\n"
                        "b.ne 7b\n"
                        "6:\n"
                        "ld1r {v14.4s}, [%[minptr]]\n"
                        "ld1r {v15.4s}, [%[maxptr]]\n"
                        "fmax v16.4s, v16.4s, v14.4s\n"
                        "fmax v17.4s, v17.4s, v14.4s\n"
                        "fmax v18.4s, v18.4s, v14.4s\n"
                        "fmax v19.4s, v19.4s, v14.4s\n"
                        "fmin v16.4s, v16.4s, v15.4s\n"
                        "fmin v17.4s, v17.4s, v15.4s\n"
                        "fmin v18.4s, v18.4s, v15.4s\n"
                        "fmin v19.4s, v19.4s, v15.4s\n"
                        "str q16, [%[c_ptr0]]\n"
                        "fmax v20.4s, v20.4s, v14.4s\n"
                        "fmax v21.4s, v21.4s, v14.4s\n"
                        "fmax v22.4s, v22.4s, v14.4s\n"
                        "str q17, [%[c_ptr0], #0x10]\n"
                        "fmax v23.4s, v23.4s, v14.4s\n"
                        "fmin v20.4s, v20.4s, v15.4s\n"
                        "fmin v21.4s, v21.4s, v15.4s\n"
                        "str q18, [%[c_ptr0], #0x20]\n"
                        "fmin v22.4s, v22.4s, v15.4s\n"
                        "fmin v23.4s, v23.4s, v15.4s\n"
                        "str q19, [%[c_ptr0], #0x30]\n"
                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
                        "str q20, [c_ptr1]\n"
                        "str q21, [c_ptr1, #0x10]\n"
                        "str q22, [c_ptr1, #0x20]\n"
                        "str q23, [c_ptr1, #0x30]\n"
                        ".unreq a_ptr1\n"
                        ".unreq c_ptr1\n"
                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "cc", "memory"
                    );
                    break;
                case 3:
                    __asm __volatile (
                        "a_ptr1 .req X0\n"
                        "a_ptr2 .req X1\n"
                        "c_ptr1 .req X2\n"
                        "c_ptr2 .req X3\n"
                        "add a_ptr1, %[a_ptr0], %[lda]\n"
                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
                        "add a_ptr2, a_ptr1, %[lda]\n"
                        "add c_ptr2, c_ptr1, %[ldc]\n"
                        "cbnz %[accumulate], 1f\n"
                        "ldr q16, [%[biasptr]]\n"
                        "ldr q17, [%[biasptr], #0x10]\n"
                        "ldr q18, [%[biasptr], #0x20]\n"
                        "ldr q19, [%[biasptr], #0x30]\n"
                        "mov v20.16b, v16.16b\n"
                        "ldr q0, [%[a_ptr0]]\n"
                        "mov v21.16b, v17.16b\n"
                        "ldr q1, [a_ptr1]\n"
                        "mov v22.16b, v18.16b\n"
                        "ldr q2, [a_ptr2]\n"
                        "mov v23.16b, v19.16b\n"
                        "ldr q8, [%[b_ptr0]]\n"
                        "mov v24.16b, v16.16b\n"
                        "ldr q9, [%[b_ptr0], #0x10]\n"
                        "mov v25.16b, v17.16b\n"
                        "ldr q10, [%[b_ptr0], #0x20]\n"
                        "mov v26.16b, v18.16b\n"
                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
                        "mov v27.16b, v19.16b\n"
                        "add a_ptr1, a_ptr1, #0x10\n"
                        "add a_ptr2, a_ptr2, #0x10\n"
                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
                        "cbz %[loops], 2f\n"
                        "b 3f\n"
                        "1:\n"
                        "ldr q16, [%[c_ptr0]]\n"
                        "ldr q17, [%[c_ptr0], #0x10]\n"
                        "ldr q18, [%[c_ptr0], #0x20]\n"
                        "ldr q19, [%[c_ptr0], #0x30]\n"
                        "ldr q20, [c_ptr1]\n"
                        "ldr q21, [c_ptr1, #0x10]\n"
                        "ldr q22, [c_ptr1, #0x20]\n"
                        "ldr q23, [c_ptr1, #0x30]\n"
                        "ldr q24, [c_ptr2]\n"
                        "ldr q25, [c_ptr2, #0x10]\n"
                        "ldr q26, [c_ptr2, #0x20]\n"
                        "ldr q27, [c_ptr2, #0x30]\n"
                        "ldr q0, [%[a_ptr0]]\n"
                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
                        "ldr q1, [a_ptr1]\n"
                        "add a_ptr1, a_ptr1, #0x10\n"
                        "ldr q2, [a_ptr2]\n"
                        "add a_ptr2, a_ptr2, #0x10\n"
                        "ldr q8, [%[b_ptr0]]\n"
                        "ldr q9, [%[b_ptr0], #0x10]\n"
                        "ldr q10, [%[b_ptr0], #0x20]\n"
                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
                        "cbz %[loops], 2f\n"
                        "3:\n"
                        "fmla v16.4s, v8.4s, v0.s[0]\n"
                        "ldr q11, [%[b_ptr0], #-0x10]\n"
                        "fmla v20.4s, v8.4s, v1.s[0]\n"
                        "ldr q4, [%[a_ptr0]]\n"
                        "fmla v24.4s, v8.4s, v2.s[0]\n"
                        "ldr q5, [a_ptr1]\n"
                        "fmla v17.4s, v9.4s, v0.s[0]\n"
                        "ldr q6, [a_ptr2]\n"
                        "fmla v21.4s, v9.4s, v1.s[0]\n"
                        "ldr q8, [%[b_ptr0]]\n"
                        "fmla v25.4s, v9.4s, v2.s[0]\n"
                        "ldr q9, [%[b_ptr0], #0x10]\n"
                        "fmla v18.4s, v10.4s, v0.s[0]\n"
                        "subs %[loops], %[loops], #0x1\n"
                        "fmla v22.4s, v10.4s, v1.s[0]\n"
                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
                        "fmla v26.4s, v10.4s, v2.s[0]\n"
                        "ldr q10, [%[b_ptr0], #0x20]\n"
                        "fmla v19.4s, v11.4s, v0.s[0]\n"
                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
                        "fmla v23.4s, v11.4s, v1.s[0]\n"
                        "add a_ptr1, a_ptr1, #0x20\n"
                        "fmla v27.4s, v11.4s, v2.s[0]\n"
                        "ldr q11, [%[b_ptr0], #0x30]\n"
                        "fmla v16.4s, v8.4s, v0.s[1]\n"
                        "add a_ptr2, a_ptr2, #0x20\n"
                        "fmla v20.4s, v8.4s, v1.s[1]\n"
                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
                        "fmla v24.4s, v8.4s, v2.s[1]\n"
                        "ldr q8, [%[b_ptr0], #0x40]\n"
                        "fmla v17.4s, v9.4s, v0.s[1]\n"
                        "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
                        "fmla v21.4s, v9.4s, v1.s[1]\n"
                        "fmla v25.4s, v9.4s, v2.s[1]\n"
                        "ldr q9, [%[b_ptr0], #0x50]\n"
                        "fmla v18.4s, v10.4s, v0.s[1]\n"
                        "fmla v22.4s, v10.4s, v1.s[1]\n"
                        "fmla v26.4s, v10.4s, v2.s[1]\n"
                        "ldr q10, [%[b_ptr0], #0x60]\n"
                        "fmla v19.4s, v11.4s, v0.s[1]\n"
                        "fmla v23.4s, v11.4s, v1.s[1]\n"
                        "fmla v27.4s, v11.4s, v2.s[1]\n"
                        "ldr q11, [%[b_ptr0], #0x70]\n"
                        "fmla v16.4s, v8.4s, v0.s[2]\n"
                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
                        "fmla v20.4s, v8.4s, v1.s[2]\n"
                        "fmla v24.4s, v8.4s, v2.s[2]\n"
                        "ldr q8, [%[b_ptr0], #-0x80]\n"
                        "fmla v17.4s, v9.4s, v0.s[2]\n"
                        "fmla v21.4s, v9.4s, v1.s[2]\n"
                        "fmla v25.4s, v9.4s, v2.s[2]\n"
                        "ldr q9, [%[b_ptr0], #-0x70]\n"
                        "fmla v18.4s, v10.4s, v0.s[2]\n"
                        "fmla v22.4s, v10.4s, v1.s[2]\n"
                        "fmla v26.4s, v10.4s, v2.s[2]\n"
                        "ldr q10, [%[b_ptr0], #-0x60]\n"
                        "fmla v19.4s, v11.4s, v0.s[2]\n"
                        "fmla v23.4s, v11.4s, v1.s[2]\n"
                        "fmla v27.4s, v11.4s, v2.s[2]\n"
                        "ldr q11, [%[b_ptr0], #-0x50]\n"
                        "fmla v16.4s, v8.4s, v0.s[3]\n"
                        "fmla v20.4s, v8.4s, v1.s[3]\n"
                        "fmla v24.4s, v8.4s, v2.s[3]\n"
                        "ldr q8, [%[b_ptr0], #-0x40]\n"
                        "fmla v17.4s, v9.4s, v0.s[3]\n"
                        "fmla v21.4s, v9.4s, v1.s[3]\n"
                        "fmla v25.4s, v9.4s, v2.s[3]\n"
                        "ldr q9, [%[b_ptr0], #-0x30]\n"
                        "fmla v18.4s, v10.4s, v0.s[3]\n"
                        "fmla v22.4s, v10.4s, v1.s[3]\n"
                        "fmla v26.4s, v10.4s, v2.s[3]\n"
                        "ldr q10, [%[b_ptr0], #-0x20]\n"
                        "fmla v19.4s, v11.4s, v0.s[3]\n"
                        "ldr q0, [%[a_ptr0], #-0x10]\n"
                        "fmla v23.4s, v11.4s, v1.s[3]\n"
                        "ldr q1, [a_ptr1, #-0x10]\n"
                        "fmla v27.4s, v11.4s, v2.s[3]\n"
                        "ldr q11, [%[b_ptr0], #-0x10]\n"
                        "fmla v16.4s, v8.4s, v4.s[0]\n"
                        "ldr q2, [a_ptr2, #-0x10]\n"
                        "fmla v20.4s, v8.4s, v5.s[0]\n"
                        "fmla v24.4s, v8.4s, v6.s[0]\n"
                        "ldr q8, [%[b_ptr0]]\n"
                        "fmla v17.4s, v9.4s, v4.s[0]\n"
                        "fmla v21.4s, v9.4s, v5.s[0]\n"
                        "fmla v25.4s, v9.4s, v6.s[0]\n"
                        "ldr q9, [%[b_ptr0], #0x10]\n"
                        "fmla v18.4s, v10.4s, v4.s[0]\n"
                        "fmla v22.4s, v10.4s, v5.s[0]\n"
                        "fmla v26.4s, v10.4s, v6.s[0]\n"
                        "ldr q10, [%[b_ptr0], #0x20]\n"
                        "fmla v19.4s, v11.4s, v4.s[0]\n"
                        "fmla v23.4s, v11.4s, v5.s[0]\n"
                        "fmla v27.4s, v11.4s, v6.s[0]\n"
                        "ldr q11, [%[b_ptr0], #0x30]\n"
                        "fmla v16.4s, v8.4s, v4.s[1]\n"
                        "fmla v20.4s, v8.4s, v5.s[1]\n"
                        "fmla v24.4s, v8.4s, v6.s[1]\n"
                        "ldr q8, [%[b_ptr0], #0x40]\n"
                        "fmla v17.4s, v9.4s, v4.s[1]\n"
                        "fmla v21.4s, v9.4s, v5.s[1]\n"
                        "fmla v25.4s, v9.4s, v6.s[1]\n"
                        "ldr q9, [%[b_ptr0], #0x50]\n"
                        "fmla v18.4s, v10.4s, v4.s[1]\n"
                        "fmla v22.4s, v10.4s, v5.s[1]\n"
                        "fmla v26.4s, v10.4s, v6.s[1]\n"
                        "ldr q10, [%[b_ptr0], #0x60]\n"
                        "fmla v19.4s, v11.4s, v4.s[1]\n"
                        "fmla v23.4s, v11.4s, v5.s[1]\n"
                        "fmla v27.4s, v11.4s, v6.s[1]\n"
                        "ldr q11, [%[b_ptr0], #0x70]\n"
                        "fmla v16.4s, v8.4s, v4.s[2]\n"
                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
                        "fmla v20.4s, v8.4s, v5.s[2]\n"
                        "fmla v24.4s, v8.4s, v6.s[2]\n"
                        "ldr q8, [%[b_ptr0], #-0x80]\n"
                        "fmla v17.4s, v9.4s, v4.s[2]\n"
                        "fmla v21.4s, v9.4s, v5.s[2]\n"
                        "fmla v25.4s, v9.4s, v6.s[2]\n"
                        "ldr q9, [%[b_ptr0], #-0x70]\n"
                        "fmla v18.4s, v10.4s, v4.s[2]\n"
                        "fmla v22.4s, v10.4s, v5.s[2]\n"
                        "fmla v26.4s, v10.4s, v6.s[2]\n"
                        "ldr q10, [%[b_ptr0], #-0x60]\n"
                        "fmla v19.4s, v11.4s, v4.s[2]\n"
                        "fmla v23.4s, v11.4s, v5.s[2]\n"
                        "fmla v27.4s, v11.4s, v6.s[2]\n"
                        "ldr q11, [%[b_ptr0], #-0x50]\n"
                        "fmla v16.4s, v8.4s, v4.s[3]\n"
                        "fmla v20.4s, v8.4s, v5.s[3]\n"
                        "fmla v24.4s, v8.4s, v6.s[3]\n"
                        "ldr q8, [%[b_ptr0], #-0x40]\n"
                        "fmla v17.4s, v9.4s, v4.s[3]\n"
                        "fmla v21.4s, v9.4s, v5.s[3]\n"
                        "fmla v25.4s, v9.4s, v6.s[3]\n"
                        "ldr q9, [%[b_ptr0], #-0x30]\n"
                        "fmla v18.4s, v10.4s, v4.s[3]\n"
                        "fmla v22.4s, v10.4s, v5.s[3]\n"
                        "fmla v26.4s, v10.4s, v6.s[3]\n"
                        "ldr q10, [%[b_ptr0], #-0x20]\n"
                        "fmla v19.4s, v11.4s, v4.s[3]\n"
                        "fmla v23.4s, v11.4s, v5.s[3]\n"
                        "fmla v27.4s, v11.4s, v6.s[3]\n"
                        "b.ne 3b\n"
                        "2:\n"
                        "ldr q11, [%[b_ptr0], #-0x10]\n"
                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
                        "prfm PSTL1KEEP, [c_ptr1]\n"
                        "prfm PSTL1KEEP, [c_ptr2]\n"
                        "cbz %[regs], 4f\n"
                        "fmla v16.4s, v8.4s, v0.s[0]\n"
                        "ldr q4, [%[a_ptr0]]\n"
                        "fmla v20.4s, v8.4s, v1.s[0]\n"
                        "ldr q5, [a_ptr1]\n"
                        "fmla v24.4s, v8.4s, v2.s[0]\n"
                        "ldr q6, [a_ptr2]\n"
                        "fmla v17.4s, v9.4s, v0.s[0]\n"
                        "ldr q8, [%[b_ptr0]]\n"
                        "fmla v21.4s, v9.4s, v1.s[0]\n"
                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
                        "fmla v25.4s, v9.4s, v2.s[0]\n"
                        "ldr q9, [%[b_ptr0], #0x10]\n"
                        "fmla v18.4s, v10.4s, v0.s[0]\n"
                        "add a_ptr1, a_ptr1, #0x10\n"
                        "fmla v22.4s, v10.4s, v1.s[0]\n"
                        "add a_ptr2, a_ptr2, #0x10\n"
                        "fmla v26.4s, v10.4s, v2.s[0]\n"
                        "ldr q10, [%[b_ptr0], #0x20]\n"
                        "fmla v19.4s, v11.4s, v0.s[0]\n"
                        "fmla v23.4s, v11.4s, v1.s[0]\n"
                        "fmla v27.4s, v11.4s, v2.s[0]\n"
                        "ldr q11, [%[b_ptr0], #0x30]\n"
                        "fmla v16.4s, v8.4s, v0.s[1]\n"
                        "fmla v20.4s, v8.4s, v1.s[1]\n"
                        "fmla v24.4s, v8.4s, v2.s[1]\n"
                        "ldr q8, [%[b_ptr0], #0x40]\n"
                        "fmla v17.4s, v9.4s, v0.s[1]\n"
                        "fmla v21.4s, v9.4s, v1.s[1]\n"
                        "fmla v25.4s, v9.4s, v2.s[1]\n"
                        "ldr q9, [%[b_ptr0], #0x50]\n"
                        "fmla v18.4s, v10.4s, v0.s[1]\n"
                        "fmla v22.4s, v10.4s, v1.s[1]\n"
                        "fmla v26.4s, v10.4s, v2.s[1]\n"
                        "ldr q10, [%[b_ptr0], #0x60]\n"
                        "fmla v19.4s, v11.4s, v0.s[1]\n"
                        "fmla v23.4s, v11.4s, v1.s[1]\n"
                        "fmla v27.4s, v11.4s, v2.s[1]\n"
                        "ldr q11, [%[b_ptr0], #0x70]\n"
                        "fmla v16.4s, v8.4s, v0.s[2]\n"
                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
                        "fmla v20.4s, v8.4s, v1.s[2]\n"
                        "fmla v24.4s, v8.4s, v2.s[2]\n"
                        "ldr q8, [%[b_ptr0], #-0x80]\n"
                        "fmla v17.4s, v9.4s, v0.s[2]\n"
                        "fmla v21.4s, v9.4s, v1.s[2]\n"
                        "fmla v25.4s, v9.4s, v2.s[2]\n"
                        "ldr q9, [%[b_ptr0], #-0x70]\n"
                        "fmla v18.4s, v10.4s, v0.s[2]\n"
                        "fmla v22.4s, v10.4s, v1.s[2]\n"
                        "fmla v26.4s, v10.4s, v2.s[2]\n"
                        "ldr q10, [%[b_ptr0], #-0x60]\n"
                        "fmla v19.4s, v11.4s, v0.s[2]\n"
                        "fmla v23.4s, v11.4s, v1.s[2]\n"
                        "fmla v27.4s, v11.4s, v2.s[2]\n"
                        "ldr q11, [%[b_ptr0], #-0x50]\n"
                        "fmla v16.4s, v8.4s, v0.s[3]\n"
                        "fmla v20.4s, v8.4s, v1.s[3]\n"
                        "fmla v24.4s, v8.4s, v2.s[3]\n"
                        "ldr q8, [%[b_ptr0], #-0x40]\n"
                        "fmla v17.4s, v9.4s, v0.s[3]\n"
                        "fmla v21.4s, v9.4s, v1.s[3]\n"
                        "fmla v25.4s, v9.4s, v2.s[3]\n"
                        "ldr q9, [%[b_ptr0], #-0x30]\n"
                        "fmla v18.4s, v10.4s, v0.s[3]\n"
                        "fmla v22.4s, v10.4s, v1.s[3]\n"
                        "fmla v26.4s, v10.4s, v2.s[3]\n"
                        "ldr q10, [%[b_ptr0], #-0x20]\n"
                        "fmla v19.4s, v11.4s, v0.s[3]\n"
                        "fmla v23.4s, v11.4s, v1.s[3]\n"
                        "fmla v27.4s, v11.4s, v2.s[3]\n"
                        "ldr q11, [%[b_ptr0], #-0x10]\n"
                        "fmla v16.4s, v8.4s, v4.s[0]\n"
                        "fmla v20.4s, v8.4s, v5.s[0]\n"
                        "fmla v24.4s, v8.4s, v6.s[0]\n"
                        "ldr q8, [%[b_ptr0]]\n"
                        "fmla v17.4s, v9.4s, v4.s[0]\n"
                        "fmla v21.4s, v9.4s, v5.s[0]\n"
                        "fmla v25.4s, v9.4s, v6.s[0]\n"
                        "ldr q9, [%[b_ptr0], #0x10]\n"
                        "fmla v18.4s, v10.4s, v4.s[0]\n"
                        "fmla v22.4s, v10.4s, v5.s[0]\n"
                        "fmla v26.4s, v10.4s, v6.s[0]\n"
                        "ldr q10, [%[b_ptr0], #0x20]\n"
                        "fmla v19.4s, v11.4s, v4.s[0]\n"
                        "fmla v23.4s, v11.4s, v5.s[0]\n"
                        "fmla v27.4s, v11.4s, v6.s[0]\n"
                        "ldr q11, [%[b_ptr0], #0x30]\n"
                        "fmla v16.4s, v8.4s, v4.s[1]\n"
                        "fmla v20.4s, v8.4s, v5.s[1]\n"
                        "fmla v24.4s, v8.4s, v6.s[1]\n"
                        "ldr q8, [%[b_ptr0], #0x40]\n"
                        "fmla v17.4s, v9.4s, v4.s[1]\n"
                        "fmla v21.4s, v9.4s, v5.s[1]\n"
                        "fmla v25.4s, v9.4s, v6.s[1]\n"
                        "ldr q9, [%[b_ptr0], #0x50]\n"
                        "fmla v18.4s, v10.4s, v4.s[1]\n"
                        "fmla v22.4s, v10.4s, v5.s[1]\n"
                        "fmla v26.4s, v10.4s, v6.s[1]\n"
                        "ldr q10, [%[b_ptr0], #0x60]\n"
                        "fmla v19.4s, v11.4s, v4.s[1]\n"
                        "fmla v23.4s, v11.4s, v5.s[1]\n"
                        "fmla v27.4s, v11.4s, v6.s[1]\n"
                        "ldr q11, [%[b_ptr0], #0x70]\n"
                        "fmla v16.4s, v8.4s, v4.s[2]\n"
                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
                        "fmla v20.4s, v8.4s, v5.s[2]\n"
                        "fmla v24.4s, v8.4s, v6.s[2]\n"
                        "ldr q8, [%[b_ptr0], #-0x80]\n"
                        "fmla v17.4s, v9.4s, v4.s[2]\n"
                        "fmla v21.4s, v9.4s, v5.s[2]\n"
                        "fmla v25.4s, v9.4s, v6.s[2]\n"
                        "ldr q9, [%[b_ptr0], #-0x70]\n"
                        "fmla v18.4s, v10.4s, v4.s[2]\n"
                        "fmla v22.4s, v10.4s, v5.s[2]\n"
                        "fmla v26.4s, v10.4s, v6.s[2]\n"
                        "ldr q10, [%[b_ptr0], #-0x60]\n"
                        "fmla v19.4s, v11.4s, v4.s[2]\n"
                        "fmla v23.4s, v11.4s, v5.s[2]\n"
                        "fmla v27.4s, v11.4s, v6.s[2]\n"
                        "ldr q11, [%[b_ptr0], #-0x50]\n"
                        "fmla v16.4s, v8.4s, v4.s[3]\n"
                        "add %[b_ptr0], %[b_ptr0], #-0x40\n"
                        "fmla v20.4s, v8.4s, v5.s[3]\n"
                        "fmla v24.4s, v8.4s, v6.s[3]\n"
                        "fmla v17.4s, v9.4s, v4.s[3]\n"
                        "fmla v21.4s, v9.4s, v5.s[3]\n"
                        "fmla v25.4s, v9.4s, v6.s[3]\n"
                        "fmla v18.4s, v10.4s, v4.s[3]\n"
                        "fmla v22.4s, v10.4s, v5.s[3]\n"
                        "fmla v26.4s, v10.4s, v6.s[3]\n"
                        "fmla v19.4s, v11.4s, v4.s[3]\n"
                        "fmla v23.4s, v11.4s, v5.s[3]\n"
                        "fmla v27.4s, v11.4s, v6.s[3]\n"
                        "b 5f\n"
                        "4:\n"
                        "fmla v16.4s, v8.4s, v0.s[0]\n"
                        "fmla v20.4s, v8.4s, v1.s[0]\n"
                        "fmla v24.4s, v8.4s, v2.s[0]\n"
                        "ldr q8, [%[b_ptr0]]\n"
                        "fmla v17.4s, v9.4s, v0.s[0]\n"
                        "fmla v21.4s, v9.4s, v1.s[0]\n"
                        "fmla v25.4s, v9.4s, v2.s[0]\n"
                        "ldr q9, [%[b_ptr0], #0x10]\n"
                        "fmla v18.4s, v10.4s, v0.s[0]\n"
                        "fmla v22.4s, v10.4s, v1.s[0]\n"
                        "fmla v26.4s, v10.4s, v2.s[0]\n"
                        "ldr q10, [%[b_ptr0], #0x20]\n"
                        "fmla v19.4s, v11.4s, v0.s[0]\n"
                        "fmla v23.4s, v11.4s, v1.s[0]\n"
                        "fmla v27.4s, v11.4s, v2.s[0]\n"
                        "ldr q11, [%[b_ptr0], #0x30]\n"
                        "fmla v16.4s, v8.4s, v0.s[1]\n"
                        "fmla v20.4s, v8.4s, v1.s[1]\n"
                        "fmla v24.4s, v8.4s, v2.s[1]\n"
                        "ldr q8, [%[b_ptr0], #0x40]\n"
                        "fmla v17.4s, v9.4s, v0.s[1]\n"
                        "fmla v21.4s, v9.4s, v1.s[1]\n"
                        "fmla v25.4s, v9.4s, v2.s[1]\n"
                        "ldr q9, [%[b_ptr0], #0x50]\n"
                        "fmla v18.4s, v10.4s, v0.s[1]\n"
                        "fmla v22.4s, v10.4s, v1.s[1]\n"
                        "fmla v26.4s, v10.4s, v2.s[1]\n"
                        "ldr q10, [%[b_ptr0], #0x60]\n"
                        "fmla v19.4s, v11.4s, v0.s[1]\n"
                        "fmla v23.4s, v11.4s, v1.s[1]\n"
                        "fmla v27.4s, v11.4s, v2.s[1]\n"
                        "ldr q11, [%[b_ptr0], #0x70]\n"
                        "fmla v16.4s, v8.4s, v0.s[2]\n"
                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
                        "fmla v20.4s, v8.4s, v1.s[2]\n"
                        "fmla v24.4s, v8.4s, v2.s[2]\n"
                        "ldr q8, [%[b_ptr0], #-0x80]\n"
                        "fmla v17.4s, v9.4s, v0.s[2]\n"
                        "fmla v21.4s, v9.4s, v1.s[2]\n"
                        "fmla v25.4s, v9.4s, v2.s[2]\n"
                        "ldr q9, [%[b_ptr0], #-0x70]\n"
                        "fmla v18.4s, v10.4s, v0.s[2]\n"
                        "fmla v22.4s, v10.4s, v1.s[2]\n"
                        "fmla v26.4s, v10.4s, v2.s[2]\n"
                        "ldr q10, [%[b_ptr0], #-0x60]\n"
                        "fmla v19.4s, v11.4s, v0.s[2]\n"
                        "fmla v23.4s, v11.4s, v1.s[2]\n"
                        "fmla v27.4s, v11.4s, v2.s[2]\n"
                        "ldr q11, [%[b_ptr0], #-0x50]\n"
                        "fmla v16.4s, v8.4s, v0.s[3]\n"
                        "add %[b_ptr0], %[b_ptr0], #-0x40\n"
                        "fmla v20.4s, v8.4s, v1.s[3]\n"
                        "fmla v24.4s, v8.4s, v2.s[3]\n"
                        "fmla v17.4s, v9.4s, v0.s[3]\n"
                        "fmla v21.4s, v9.4s, v1.s[3]\n"
                        "fmla v25.4s, v9.4s, v2.s[3]\n"
                        "fmla v18.4s, v10.4s, v0.s[3]\n"
                        "fmla v22.4s, v10.4s, v1.s[3]\n"
                        "fmla v26.4s, v10.4s, v2.s[3]\n"
                        "fmla v19.4s, v11.4s, v0.s[3]\n"
                        "fmla v23.4s, v11.4s, v1.s[3]\n"
                        "fmla v27.4s, v11.4s, v2.s[3]\n"
                        "5:\n"
                        "cbz %[blocks], 6f\n"
                        "7:\n"
                        "ldr q8, [%[b_ptr0]]\n"
                        "subs %[blocks], %[blocks], #0x1\n"
                        "ldr q9, [%[b_ptr0], #0x10]\n"
                        "ldr s0, [%[a_ptr0]]\n"
                        "ldr q10, [%[b_ptr0], #0x20]\n"
                        "add %[a_ptr0], %[a_ptr0], #0x4\n"
                        "ldr q11, [%[b_ptr0], #0x30]\n"
                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
                        "fmla v16.4s, v8.4s, v0.s[0]\n"
                        "ldr s1, [a_ptr1]\n"
                        "fmla v17.4s, v9.4s, v0.s[0]\n"
                        "add a_ptr1, a_ptr1, #0x4\n"
                        "fmla v18.4s, v10.4s, v0.s[0]\n"
                        "ldr s2, [a_ptr2]\n"
                        "fmla v20.4s, v8.4s, v1.s[0]\n"
                        "add a_ptr2, a_ptr2, #0x4\n"
                        "fmla v21.4s, v9.4s, v1.s[0]\n"
                        "fmla v24.4s, v8.4s, v2.s[0]\n"
                        "fmla v25.4s, v9.4s, v2.s[0]\n"
                        "fmla v22.4s, v10.4s, v1.s[0]\n"
                        "fmla v26.4s, v10.4s, v2.s[0]\n"
                        "fmla v19.4s, v11.4s, v0.s[0]\n"
                        "fmla v23.4s, v11.4s, v1.s[0]\n"
                        "fmla v27.4s, v11.4s, v2.s[0]\n"
                        "b.ne 7b\n"
                        "6:\n"
                        "ld1r {v14.4s}, [%[minptr]]\n"
                        "ld1r {v15.4s}, [%[maxptr]]\n"
                        "fmax v16.4s, v16.4s, v14.4s\n"
                        "fmax v17.4s, v17.4s, v14.4s\n"
                        "fmax v18.4s, v18.4s, v14.4s\n"
                        "fmax v19.4s, v19.4s, v14.4s\n"
                        "fmin v16.4s, v16.4s, v15.4s\n"
                        "fmin v17.4s, v17.4s, v15.4s\n"
                        "fmin v18.4s, v18.4s, v15.4s\n"
                        "fmin v19.4s, v19.4s, v15.4s\n"
                        "str q16, [%[c_ptr0]]\n"
                        "fmax v20.4s, v20.4s, v14.4s\n"
                        "fmax v21.4s, v21.4s, v14.4s\n"
                        "fmax v22.4s, v22.4s, v14.4s\n"
                        "str q17, [%[c_ptr0], #0x10]\n"
                        "fmax v23.4s, v23.4s, v14.4s\n"
                        "fmin v20.4s, v20.4s, v15.4s\n"
                        "fmin v21.4s, v21.4s, v15.4s\n"
                        "str q18, [%[c_ptr0], #0x20]\n"
                        "fmin v22.4s, v22.4s, v15.4s\n"
                        "fmin v23.4s, v23.4s, v15.4s\n"
                        "fmax v24.4s, v24.4s, v14.4s\n"
                        "str q19, [%[c_ptr0], #0x30]\n"
                        "fmax v25.4s, v25.4s, v14.4s\n"
                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
                        "fmax v26.4s, v26.4s, v14.4s\n"
                        "str q20, [c_ptr1]\n"
                        "fmin v24.4s, v24.4s, v15.4s\n"
                        "fmin v25.4s, v25.4s, v15.4s\n"
                        "fmax v27.4s, v27.4s, v14.4s\n"
                        "str q21, [c_ptr1, #0x10]\n"
                        "fmin v26.4s, v26.4s, v15.4s\n"
                        "fmin v27.4s, v27.4s, v15.4s\n"
                        "str q22, [c_ptr1, #0x20]\n"
                        "str q23, [c_ptr1, #0x30]\n"
                        "str q24, [c_ptr2]\n"
                        "str q25, [c_ptr2, #0x10]\n"
                        "str q26, [c_ptr2, #0x20]\n"
                        "str q27, [c_ptr2, #0x30]\n"
                        ".unreq a_ptr1\n"
                        ".unreq a_ptr2\n"
                        ".unreq c_ptr1\n"
                        ".unreq c_ptr2\n"
                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory"
                    );
                    break;
                default:
                case 4:
                    __asm __volatile (
                        "a_ptr1 .req X0\n"
                        "a_ptr2 .req X1\n"
                        "a_ptr3 .req X2\n"
                        "c_ptr1 .req X3\n"
                        "c_ptr2 .req X4\n"
                        "c_ptr3 .req X5\n"
                        "add a_ptr1, %[a_ptr0], %[lda]\n"
                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
                        "add a_ptr2, a_ptr1, %[lda]\n"
                        "add c_ptr2, c_ptr1, %[ldc]\n"
                        "add a_ptr3, a_ptr2, %[lda]\n"
                        "add c_ptr3, c_ptr2, %[ldc]\n"
                        "cbnz %[accumulate], 1f\n"
                        "ldr q16, [%[biasptr]]\n"
                        "ldr q17, [%[biasptr], #0x10]\n"
                        "ldr q18, [%[biasptr], #0x20]\n"
                        "ldr q19, [%[biasptr], #0x30]\n"
                        "mov v20.16b, v16.16b\n"
                        "ldr q0, [%[a_ptr0]]\n"
                        "mov v21.16b, v17.16b\n"
                        "ldr q1, [a_ptr1]\n"
                        "mov v22.16b, v18.16b\n"
                        "ldr q2, [a_ptr2]\n"
                        "mov v23.16b, v19.16b\n"
                        "ldr q3, [a_ptr3]\n"
                        "mov v24.16b, v16.16b\n"
                        "ldr q8, [%[b_ptr0]]\n"
                        "mov v25.16b, v17.16b\n"
                        "ldr q9, [%[b_ptr0], #0x10]\n"
                        "mov v26.16b, v18.16b\n"
                        "ldr q10, [%[b_ptr0], #0x20]\n"
                        "mov v27.16b, v19.16b\n"
                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
                        "mov v28.16b, v16.16b\n"
                        "add a_ptr1, a_ptr1, #0x10\n"
                        "mov v29.16b, v17.16b\n"
                        "add a_ptr2, a_ptr2, #0x10\n"
                        "mov v30.16b, v18.16b\n"
                        "add a_ptr3, a_ptr3, #0x10\n"
                        "mov v31.16b, v19.16b\n"
                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
                        "cbz %[loops], 2f\n"
                        "b 3f\n"
                        "1:\n"
                        "ldr q16, [%[c_ptr0]]\n"
                        "ldr q17, [%[c_ptr0], #0x10]\n"
                        "ldr q18, [%[c_ptr0], #0x20]\n"
                        "ldr q19, [%[c_ptr0], #0x30]\n"
                        "ldr q20, [c_ptr1]\n"
                        "ldr q21, [c_ptr1, #0x10]\n"
                        "ldr q22, [c_ptr1, #0x20]\n"
                        "ldr q23, [c_ptr1, #0x30]\n"
                        "ldr q24, [c_ptr2]\n"
                        "ldr q25, [c_ptr2, #0x10]\n"
                        "ldr q26, [c_ptr2, #0x20]\n"
                        "ldr q27, [c_ptr2, #0x30]\n"
                        "ldr q28, [c_ptr3]\n"
                        "ldr q29, [c_ptr3, #0x10]\n"
                        "ldr q30, [c_ptr3, #0x20]\n"
                        "ldr q31, [c_ptr3, #0x30]\n"
                        "ldr q0, [%[a_ptr0]]\n"
                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
                        "ldr q1, [a_ptr1]\n"
                        "add a_ptr1, a_ptr1, #0x10\n"
                        "ldr q2, [a_ptr2]\n"
                        "add a_ptr2, a_ptr2, #0x10\n"
                        "ldr q3, [a_ptr3]\n"
                        "add a_ptr3, a_ptr3, #0x10\n"
                        "ldr q8, [%[b_ptr0]]\n"
                        "ldr q9, [%[b_ptr0], #0x10]\n"
                        "ldr q10, [%[b_ptr0], #0x20]\n"
                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
                        "cbz %[loops], 2f\n"
                        "3:\n"
                        "fmla v16.4s, v8.4s, v0.s[0]\n"
                        "ldr q11, [%[b_ptr0], #-0x10]\n"
                        "fmla v20.4s, v8.4s, v1.s[0]\n"
                        "ldr q4, [%[a_ptr0]]\n"
                        "fmla v24.4s, v8.4s, v2.s[0]\n"
                        "ldr q5, [a_ptr1]\n"
                        "fmla v28.4s, v8.4s, v3.s[0]\n"
                        "ldr q6, [a_ptr2]\n"
                        "fmla v17.4s, v9.4s, v0.s[0]\n"
                        "ldr q7, [a_ptr3]\n"
                        "fmla v21.4s, v9.4s, v1.s[0]\n"
                        "ldr q8, [%[b_ptr0]]\n"
                        "fmla v25.4s, v9.4s, v2.s[0]\n"
                        "subs %[loops], %[loops], #0x1\n"
                        "fmla v29.4s, v9.4s, v3.s[0]\n"
                        "ldr q9, [%[b_ptr0], #0x10]\n"
                        "fmla v18.4s, v10.4s, v0.s[0]\n"
                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
                        "fmla v22.4s, v10.4s, v1.s[0]\n"
                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
                        "fmla v26.4s, v10.4s, v2.s[0]\n"
                        "add a_ptr1, a_ptr1, #0x20\n"
                        "fmla v30.4s, v10.4s, v3.s[0]\n"
                        "ldr q10, [%[b_ptr0], #0x20]\n"
                        "fmla v19.4s, v11.4s, v0.s[0]\n"
                        "add a_ptr2, a_ptr2, #0x20\n"
                        "fmla v23.4s, v11.4s, v1.s[0]\n"
                        "add a_ptr3, a_ptr3, #0x20\n"
                        "fmla v27.4s, v11.4s, v2.s[0]\n"
                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
                        "fmla v31.4s, v11.4s, v3.s[0]\n"
                        "ldr q11, [%[b_ptr0], #0x30]\n"
                        "fmla v16.4s, v8.4s, v0.s[1]\n"
                        "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
                        "fmla v20.4s, v8.4s, v1.s[1]\n"
                        "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
                        "fmla v24.4s, v8.4s, v2.s[1]\n"
                        "fmla v28.4s, v8.4s, v3.s[1]\n"
                        "ldr q8, [%[b_ptr0], #0x40]\n"
                        "fmla v17.4s, v9.4s, v0.s[1]\n"
                        "fmla v21.4s, v9.4s, v1.s[1]\n"
                        "fmla v25.4s, v9.4s, v2.s[1]\n"
                        "fmla v29.4s, v9.4s, v3.s[1]\n"
                        "ldr q9, [%[b_ptr0], #0x50]\n"
                        "fmla v18.4s, v10.4s, v0.s[1]\n"
                        "fmla v22.4s, v10.4s, v1.s[1]\n"
                        "fmla v26.4s, v10.4s, v2.s[1]\n"
                        "fmla v30.4s, v10.4s, v3.s[1]\n"
                        "ldr q10, [%[b_ptr0], #0x60]\n"
                        "fmla v19.4s, v11.4s, v0.s[1]\n"
                        "fmla v23.4s, v11.4s, v1.s[1]\n"
                        "fmla v27.4s, v11.4s, v2.s[1]\n"
                        "fmla v31.4s, v11.4s, v3.s[1]\n"
                        "ldr q11, [%[b_ptr0], #0x70]\n"
                        "fmla v16.4s, v8.4s, v0.s[2]\n"
                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
                        "fmla v20.4s, v8.4s, v1.s[2]\n"
                        "fmla v24.4s, v8.4s, v2.s[2]\n"
                        "fmla v28.4s, v8.4s, v3.s[2]\n"
                        "ldr q8, [%[b_ptr0], #-0x80]\n"
                        "fmla v17.4s, v9.4s, v0.s[2]\n"
                        "fmla v21.4s, v9.4s, v1.s[2]\n"
                        "fmla v25.4s, v9.4s, v2.s[2]\n"
                        "fmla v29.4s, v9.4s, v3.s[2]\n"
                        "ldr q9, [%[b_ptr0], #-0x70]\n"
                        "fmla v18.4s, v10.4s, v0.s[2]\n"
                        "fmla v22.4s, v10.4s, v1.s[2]\n"
                        "fmla v26.4s, v10.4s, v2.s[2]\n"
                        "fmla v30.4s, v10.4s, v3.s[2]\n"
                        "ldr q10, [%[b_ptr0], #-0x60]\n"
                        "fmla v19.4s, v11.4s, v0.s[2]\n"
                        "fmla v23.4s, v11.4s, v1.s[2]\n"
                        "fmla v27.4s, v11.4s, v2.s[2]\n"
                        "fmla v31.4s, v11.4s, v3.s[2]\n"
                        "ldr q11, [%[b_ptr0], #-0x50]\n"
                        "fmla v16.4s, v8.4s, v0.s[3]\n"
                        "fmla v20.4s, v8.4s, v1.s[3]\n"
                        "fmla v24.4s, v8.4s, v2.s[3]\n"
                        "fmla v28.4s, v8.4s, v3.s[3]\n"
                        "ldr q8, [%[b_ptr0], #-0x40]\n"
                        "fmla v17.4s, v9.4s, v0.s[3]\n"
                        "fmla v21.4s, v9.4s, v1.s[3]\n"
                        "fmla v25.4s, v9.4s, v2.s[3]\n"
                        "fmla v29.4s, v9.4s, v3.s[3]\n"
                        "ldr q9, [%[b_ptr0], #-0x30]\n"
                        "fmla v18.4s, v10.4s, v0.s[3]\n"
                        "fmla v22.4s, v10.4s, v1.s[3]\n"
                        "fmla v26.4s, v10.4s, v2.s[3]\n"
                        "fmla v30.4s, v10.4s, v3.s[3]\n"
                        "ldr q10, [%[b_ptr0], #-0x20]\n"
                        "fmla v19.4s, v11.4s, v0.s[3]\n"
                        "ldr q0, [%[a_ptr0], #-0x10]\n"
                        "fmla v23.4s, v11.4s, v1.s[3]\n"
                        "ldr q1, [a_ptr1, #-0x10]\n"
                        "fmla v27.4s, v11.4s, v2.s[3]\n"
                        "ldr q2, [a_ptr2, #-0x10]\n"
                        "fmla v31.4s, v11.4s, v3.s[3]\n"
                        "ldr q11, [%[b_ptr0], #-0x10]\n"
                        "fmla v16.4s, v8.4s, v4.s[0]\n"
                        "ldr q3, [a_ptr3, #-0x10]\n"
                        "fmla v20.4s, v8.4s, v5.s[0]\n"
                        "fmla v24.4s, v8.4s, v6.s[0]\n"
                        "fmla v28.4s, v8.4s, v7.s[0]\n"
                        "ldr q8, [%[b_ptr0]]\n"
                        "fmla v17.4s, v9.4s, v4.s[0]\n"
                        "fmla v21.4s, v9.4s, v5.s[0]\n"
                        "fmla v25.4s, v9.4s, v6.s[0]\n"
                        "fmla v29.4s, v9.4s, v7.s[0]\n"
                        "ldr q9, [%[b_ptr0], #0x10]\n"
                        "fmla v18.4s, v10.4s, v4.s[0]\n"
                        "fmla v22.4s, v10.4s, v5.s[0]\n"
                        "fmla v26.4s, v10.4s, v6.s[0]\n"
                        "fmla v30.4s, v10.4s, v7.s[0]\n"
                        "ldr q10, [%[b_ptr0], #0x20]\n"
                        "fmla v19.4s, v11.4s, v4.s[0]\n"
                        "fmla v23.4s, v11.4s, v5.s[0]\n"
                        "fmla v27.4s, v11.4s, v6.s[0]\n"
                        "fmla v31.4s, v11.4s, v7.s[0]\n"
                        "ldr q11, [%[b_ptr0], #0x30]\n"
                        "fmla v16.4s, v8.4s, v4.s[1]\n"
                        "fmla v20.4s, v8.4s, v5.s[1]\n"
                        "fmla v24.4s, v8.4s, v6.s[1]\n"
                        "fmla v28.4s, v8.4s, v7.s[1]\n"
                        "ldr q8, [%[b_ptr0], #0x40]\n"
                        "fmla v17.4s, v9.4s, v4.s[1]\n"
                        "fmla v21.4s, v9.4s, v5.s[1]\n"
                        "fmla v25.4s, v9.4s, v6.s[1]\n"
                        "fmla v29.4s, v9.4s, v7.s[1]\n"
                        "ldr q9, [%[b_ptr0], #0x50]\n"
                        "fmla v18.4s, v10.4s, v4.s[1]\n"
                        "fmla v22.4s, v10.4s, v5.s[1]\n"
                        "fmla v26.4s, v10.4s, v6.s[1]\n"
                        "fmla v30.4s, v10.4s, v7.s[1]\n"
                        "ldr q10, [%[b_ptr0], #0x60]\n"
                        "fmla v19.4s, v11.4s, v4.s[1]\n"
                        "fmla v23.4s, v11.4s, v5.s[1]\n"
                        "fmla v27.4s, v11.4s, v6.s[1]\n"
                        "fmla v31.4s, v11.4s, v7.s[1]\n"
                        "ldr q11, [%[b_ptr0], #0x70]\n"
                        "fmla v16.4s, v8.4s, v4.s[2]\n"
                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
                        "fmla v20.4s, v8.4s, v5.s[2]\n"
                        "fmla v24.4s, v8.4s, v6.s[2]\n"
                        "fmla v28.4s, v8.4s, v7.s[2]\n"
                        "ldr q8, [%[b_ptr0], #-0x80]\n"
                        "fmla v17.4s, v9.4s, v4.s[2]\n"
                        "fmla v21.4s, v9.4s, v5.s[2]\n"
                        "fmla v25.4s, v9.4s, v6.s[2]\n"
                        "fmla v29.4s, v9.4s, v7.s[2]\n"
                        "ldr q9, [%[b_ptr0], #-0x70]\n"
                        "fmla v18.4s, v10.4s, v4.s[2]\n"
                        "fmla v22.4s, v10.4s, v5.s[2]\n"
                        "fmla v26.4s, v10.4s, v6.s[2]\n"
                        "fmla v30.4s, v10.4s, v7.s[2]\n"
                        "ldr q10, [%[b_ptr0], #-0x60]\n"
                        "fmla v19.4s, v11.4s, v4.s[2]\n"
                        "fmla v23.4s, v11.4s, v5.s[2]\n"
                        "fmla v27.4s, v11.4s, v6.s[2]\n"
                        "fmla v31.4s, v11.4s, v7.s[2]\n"
                        "ldr q11, [%[b_ptr0], #-0x50]\n"
                        "fmla v16.4s, v8.4s, v4.s[3]\n"
                        "fmla v20.4s, v8.4s, v5.s[3]\n"
                        "fmla v24.4s, v8.4s, v6.s[3]\n"
                        "fmla v28.4s, v8.4s, v7.s[3]\n"
                        "ldr q8, [%[b_ptr0], #-0x40]\n"
                        "fmla v17.4s, v9.4s, v4.s[3]\n"
                        "fmla v21.4s, v9.4s, v5.s[3]\n"
                        "fmla v25.4s, v9.4s, v6.s[3]\n"
                        "fmla v29.4s, v9.4s, v7.s[3]\n"
                        "ldr q9, [%[b_ptr0], #-0x30]\n"
                        "fmla v18.4s, v10.4s, v4.s[3]\n"
                        "fmla v22.4s, v10.4s, v5.s[3]\n"
                        "fmla v26.4s, v10.4s, v6.s[3]\n"
                        "fmla v30.4s, v10.4s, v7.s[3]\n"
                        "ldr q10, [%[b_ptr0], #-0x20]\n"
                        "fmla v19.4s, v11.4s, v4.s[3]\n"
                        "fmla v23.4s, v11.4s, v5.s[3]\n"
                        "fmla v27.4s, v11.4s, v6.s[3]\n"
                        "fmla v31.4s, v11.4s, v7.s[3]\n"
                        "b.ne 3b\n"
                        "2:\n"
                        "ldr q11, [%[b_ptr0], #-0x10]\n"
                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
                        "prfm PSTL1KEEP, [c_ptr1]\n"
                        "prfm PSTL1KEEP, [c_ptr2]\n"
                        "prfm PSTL1KEEP, [c_ptr3]\n"
                        "cbz %[regs], 4f\n"
                        "fmla v16.4s, v8.4s, v0.s[0]\n"
                        "ldr q4, [%[a_ptr0]]\n"
                        "fmla v20.4s, v8.4s, v1.s[0]\n"
                        "ldr q5, [a_ptr1]\n"
                        "fmla v24.4s, v8.4s, v2.s[0]\n"
                        "ldr q6, [a_ptr2]\n"
                        "fmla v28.4s, v8.4s, v3.s[0]\n"
                        "ldr q7, [a_ptr3]\n"
                        "fmla v17.4s, v9.4s, v0.s[0]\n"
                        "ldr q8, [%[b_ptr0]]\n"
                        "fmla v21.4s, v9.4s, v1.s[0]\n"
                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
                        "fmla v25.4s, v9.4s, v2.s[0]\n"
                        "add a_ptr1, a_ptr1, #0x10\n"
                        "fmla v29.4s, v9.4s, v3.s[0]\n"
                        "ldr q9, [%[b_ptr0], #0x10]\n"
                        "fmla v18.4s, v10.4s, v0.s[0]\n"
                        "add a_ptr2, a_ptr2, #0x10\n"
                        "fmla v22.4s, v10.4s, v1.s[0]\n"
                        "add a_ptr3, a_ptr3, #0x10\n"
                        "fmla v26.4s, v10.4s, v2.s[0]\n"
                        "fmla v30.4s, v10.4s, v3.s[0]\n"
                        "ldr q10, [%[b_ptr0], #0x20]\n"
                        "fmla v19.4s, v11.4s, v0.s[0]\n"
                        "fmla v23.4s, v11.4s, v1.s[0]\n"
                        "fmla v27.4s, v11.4s, v2.s[0]\n"
                        "fmla v31.4s, v11.4s, v3.s[0]\n"
                        "ldr q11, [%[b_ptr0], #0x30]\n"
                        "fmla v16.4s, v8.4s, v0.s[1]\n"
                        "fmla v20.4s, v8.4s, v1.s[1]\n"
                        "fmla v24.4s, v8.4s, v2.s[1]\n"
                        "fmla v28.4s, v8.4s, v3.s[1]\n"
                        "ldr q8, [%[b_ptr0], #0x40]\n"
                        "fmla v17.4s, v9.4s, v0.s[1]\n"
                        "fmla v21.4s, v9.4s, v1.s[1]\n"
                        "fmla v25.4s, v9.4s, v2.s[1]\n"
                        "fmla v29.4s, v9.4s, v3.s[1]\n"
                        "ldr q9, [%[b_ptr0], #0x50]\n"
                        "fmla v18.4s, v10.4s, v0.s[1]\n"
                        "fmla v22.4s, v10.4s, v1.s[1]\n"
                        "fmla v26.4s, v10.4s, v2.s[1]\n"
                        "fmla v30.4s, v10.4s, v3.s[1]\n"
                        "ldr q10, [%[b_ptr0], #0x60]\n"
                        "fmla v19.4s, v11.4s, v0.s[1]\n"
                        "fmla v23.4s, v11.4s, v1.s[1]\n"
                        "fmla v27.4s, v11.4s, v2.s[1]\n"
                        "fmla v31.4s, v11.4s, v3.s[1]\n"
                        "ldr q11, [%[b_ptr0], #0x70]\n"
                        "fmla v16.4s, v8.4s, v0.s[2]\n"
                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
                        "fmla v20.4s, v8.4s, v1.s[2]\n"
                        "fmla v24.4s, v8.4s, v2.s[2]\n"
                        "fmla v28.4s, v8.4s, v3.s[2]\n"
                        "ldr q8, [%[b_ptr0], #-0x80]\n"
                        "fmla v17.4s, v9.4s, v0.s[2]\n"
                        "fmla v21.4s, v9.4s, v1.s[2]\n"
                        "fmla v25.4s, v9.4s, v2.s[2]\n"
                        "fmla v29.4s, v9.4s, v3.s[2]\n"
                        "ldr q9, [%[b_ptr0], #-0x70]\n"
                        "fmla v18.4s, v10.4s, v0.s[2]\n"
                        "fmla v22.4s, v10.4s, v1.s[2]\n"
                        "fmla v26.4s, v10.4s, v2.s[2]\n"
                        "fmla v30.4s, v10.4s, v3.s[2]\n"
                        "ldr q10, [%[b_ptr0], #-0x60]\n"
                        "fmla v19.4s, v11.4s, v0.s[2]\n"
                        "fmla v23.4s, v11.4s, v1.s[2]\n"
                        "fmla v27.4s, v11.4s, v2.s[2]\n"
                        "fmla v31.4s, v11.4s, v3.s[2]\n"
                        "ldr q11, [%[b_ptr0], #-0x50]\n"
                        "fmla v16.4s, v8.4s, v0.s[3]\n"
                        "fmla v20.4s, v8.4s, v1.s[3]\n"
                        "fmla v24.4s, v8.4s, v2.s[3]\n"
                        "fmla v28.4s, v8.4s, v3.s[3]\n"
                        "ldr q8, [%[b_ptr0], #-0x40]\n"
                        "fmla v17.4s, v9.4s, v0.s[3]\n"
                        "fmla v21.4s, v9.4s, v1.s[3]\n"
                        "fmla v25.4s, v9.4s, v2.s[3]\n"
                        "fmla v29.4s, v9.4s, v3.s[3]\n"
                        "ldr q9, [%[b_ptr0], #-0x30]\n"
                        "fmla v18.4s, v10.4s, v0.s[3]\n"
                        "fmla v22.4s, v10.4s, v1.s[3]\n"
                        "fmla v26.4s, v10.4s, v2.s[3]\n"
                        "fmla v30.4s, v10.4s, v3.s[3]\n"
                        "ldr q10, [%[b_ptr0], #-0x20]\n"
                        "fmla v19.4s, v11.4s, v0.s[3]\n"
                        "fmla v23.4s, v11.4s, v1.s[3]\n"
                        "fmla v27.4s, v11.4s, v2.s[3]\n"
                        "fmla v31.4s, v11.4s, v3.s[3]\n"
                        "ldr q11, [%[b_ptr0], #-0x10]\n"
                        "fmla v16.4s, v8.4s, v4.s[0]\n"
                        "fmla v20.4s, v8.4s, v5.s[0]\n"
                        "fmla v24.4s, v8.4s, v6.s[0]\n"
                        "fmla v28.4s, v8.4s, v7.s[0]\n"
                        "ldr q8, [%[b_ptr0]]\n"
                        "fmla v17.4s, v9.4s, v4.s[0]\n"
                        "fmla v21.4s, v9.4s, v5.s[0]\n"
                        "fmla v25.4s, v9.4s, v6.s[0]\n"
                        "fmla v29.4s, v9.4s, v7.s[0]\n"
                        "ldr q9, [%[b_ptr0], #0x10]\n"
                        "fmla v18.4s, v10.4s, v4.s[0]\n"
                        "fmla v22.4s, v10.4s, v5.s[0]\n"
                        "fmla v26.4s, v10.4s, v6.s[0]\n"
                        "fmla v30.4s, v10.4s, v7.s[0]\n"
                        "ldr q10, [%[b_ptr0], #0x20]\n"
                        "fmla v19.4s, v11.4s, v4.s[0]\n"
                        "fmla v23.4s, v11.4s, v5.s[0]\n"
                        "fmla v27.4s, v11.4s, v6.s[0]\n"
                        "fmla v31.4s, v11.4s, v7.s[0]\n"
                        "ldr q11, [%[b_ptr0], #0x30]\n"
                        "fmla v16.4s, v8.4s, v4.s[1]\n"
                        "fmla v20.4s, v8.4s, v5.s[1]\n"
                        "fmla v24.4s, v8.4s, v6.s[1]\n"
                        "fmla v28.4s, v8.4s, v7.s[1]\n"
                        "ldr q8, [%[b_ptr0], #0x40]\n"
                        "fmla v17.4s, v9.4s, v4.s[1]\n"
                        "fmla v21.4s, v9.4s, v5.s[1]\n"
                        "fmla v25.4s, v9.4s, v6.s[1]\n"
                        "fmla v29.4s, v9.4s, v7.s[1]\n"
                        "ldr q9, [%[b_ptr0], #0x50]\n"
                        "fmla v18.4s, v10.4s, v4.s[1]\n"
                        "fmla v22.4s, v10.4s, v5.s[1]\n"
                        "fmla v26.4s, v10.4s, v6.s[1]\n"
                        "fmla v30.4s, v10.4s, v7.s[1]\n"
                        "ldr q10, [%[b_ptr0], #0x60]\n"
                        "fmla v19.4s, v11.4s, v4.s[1]\n"
                        "fmla v23.4s, v11.4s, v5.s[1]\n"
                        "fmla v27.4s, v11.4s, v6.s[1]\n"
                        "fmla v31.4s, v11.4s, v7.s[1]\n"
                        "ldr q11, [%[b_ptr0], #0x70]\n"
                        "fmla v16.4s, v8.4s, v4.s[2]\n"
                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
                        "fmla v20.4s, v8.4s, v5.s[2]\n"
                        "fmla v24.4s, v8.4s, v6.s[2]\n"
                        "fmla v28.4s, v8.4s, v7.s[2]\n"
                        "ldr q8, [%[b_ptr0], #-0x80]\n"
                        "fmla v17.4s, v9.4s, v4.s[2]\n"
                        "fmla v21.4s, v9.4s, v5.s[2]\n"
                        "fmla v25.4s, v9.4s, v6.s[2]\n"
                        "fmla v29.4s, v9.4s, v7.s[2]\n"
                        "ldr q9, [%[b_ptr0], #-0x70]\n"
                        "fmla v18.4s, v10.4s, v4.s[2]\n"
                        "fmla v22.4s, v10.4s, v5.s[2]\n"
                        "fmla v26.4s, v10.4s, v6.s[2]\n"
                        "fmla v30.4s, v10.4s, v7.s[2]\n"
                        "ldr q10, [%[b_ptr0], #-0x60]\n"
                        "fmla v19.4s, v11.4s, v4.s[2]\n"
                        "fmla v23.4s, v11.4s, v5.s[2]\n"
                        "fmla v27.4s, v11.4s, v6.s[2]\n"
                        "fmla v31.4s, v11.4s, v7.s[2]\n"
                        "ldr q11, [%[b_ptr0], #-0x50]\n"
                        "fmla v16.4s, v8.4s, v4.s[3]\n"
                        "add %[b_ptr0], %[b_ptr0], #-0x40\n"
                        "fmla v20.4s, v8.4s, v5.s[3]\n"
                        "fmla v24.4s, v8.4s, v6.s[3]\n"
                        "fmla v28.4s, v8.4s, v7.s[3]\n"
                        "fmla v17.4s, v9.4s, v4.s[3]\n"
                        "fmla v21.4s, v9.4s, v5.s[3]\n"
                        "fmla v25.4s, v9.4s, v6.s[3]\n"
                        "fmla v29.4s, v9.4s, v7.s[3]\n"
                        "fmla v18.4s, v10.4s, v4.s[3]\n"
                        "fmla v22.4s, v10.4s, v5.s[3]\n"
                        "fmla v26.4s, v10.4s, v6.s[3]\n"
                        "fmla v30.4s, v10.4s, v7.s[3]\n"
                        "fmla v19.4s, v11.4s, v4.s[3]\n"
                        "fmla v23.4s, v11.4s, v5.s[3]\n"
                        "fmla v27.4s, v11.4s, v6.s[3]\n"
                        "fmla v31.4s, v11.4s, v7.s[3]\n"
                        "b 5f\n"
                        "4:\n"
                        "fmla v16.4s, v8.4s, v0.s[0]\n"
                        "fmla v20.4s, v8.4s, v1.s[0]\n"
                        "fmla v24.4s, v8.4s, v2.s[0]\n"
                        "fmla v28.4s, v8.4s, v3.s[0]\n"
                        "ldr q8, [%[b_ptr0]]\n"
                        "fmla v17.4s, v9.4s, v0.s[0]\n"
                        "fmla v21.4s, v9.4s, v1.s[0]\n"
                        "fmla v25.4s, v9.4s, v2.s[0]\n"
                        "fmla v29.4s, v9.4s, v3.s[0]\n"
                        "ldr q9, [%[b_ptr0], #0x10]\n"
                        "fmla v18.4s, v10.4s, v0.s[0]\n"
                        "fmla v22.4s, v10.4s, v1.s[0]\n"
                        "fmla v26.4s, v10.4s, v2.s[0]\n"
                        "fmla v30.4s, v10.4s, v3.s[0]\n"
                        "ldr q10, [%[b_ptr0], #0x20]\n"
                        "fmla v19.4s, v11.4s, v0.s[0]\n"
                        "fmla v23.4s, v11.4s, v1.s[0]\n"
                        "fmla v27.4s, v11.4s, v2.s[0]\n"
                        "fmla v31.4s, v11.4s, v3.s[0]\n"
                        "ldr q11, [%[b_ptr0], #0x30]\n"
                        "fmla v16.4s, v8.4s, v0.s[1]\n"
                        "fmla v20.4s, v8.4s, v1.s[1]\n"
                        "fmla v24.4s, v8.4s, v2.s[1]\n"
                        "fmla v28.4s, v8.4s, v3.s[1]\n"
                        "ldr q8, [%[b_ptr0], #0x40]\n"
                        "fmla v17.4s, v9.4s, v0.s[1]\n"
                        "fmla v21.4s, v9.4s, v1.s[1]\n"
                        "fmla v25.4s, v9.4s, v2.s[1]\n"
                        "fmla v29.4s, v9.4s, v3.s[1]\n"
                        "ldr q9, [%[b_ptr0], #0x50]\n"
                        "fmla v18.4s, v10.4s, v0.s[1]\n"
                        "fmla v22.4s, v10.4s, v1.s[1]\n"
                        "fmla v26.4s, v10.4s, v2.s[1]\n"
                        "fmla v30.4s, v10.4s, v3.s[1]\n"
                        "ldr q10, [%[b_ptr0], #0x60]\n"
                        "fmla v19.4s, v11.4s, v0.s[1]\n"
                        "fmla v23.4s, v11.4s, v1.s[1]\n"
                        "fmla v27.4s, v11.4s, v2.s[1]\n"
                        "fmla v31.4s, v11.4s, v3.s[1]\n"
                        "ldr q11, [%[b_ptr0], #0x70]\n"
                        "fmla v16.4s, v8.4s, v0.s[2]\n"
                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
                        "fmla v20.4s, v8.4s, v1.s[2]\n"
                        "fmla v24.4s, v8.4s, v2.s[2]\n"
                        "fmla v28.4s, v8.4s, v3.s[2]\n"
                        "ldr q8, [%[b_ptr0], #-0x80]\n"
                        "fmla v17.4s, v9.4s, v0.s[2]\n"
                        "fmla v21.4s, v9.4s, v1.s[2]\n"
                        "fmla v25.4s, v9.4s, v2.s[2]\n"
                        "fmla v29.4s, v9.4s, v3.s[2]\n"
                        "ldr q9, [%[b_ptr0], #-0x70]\n"
                        "fmla v18.4s, v10.4s, v0.s[2]\n"
                        "fmla v22.4s, v10.4s, v1.s[2]\n"
                        "fmla v26.4s, v10.4s, v2.s[2]\n"
                        "fmla v30.4s, v10.4s, v3.s[2]\n"
                        "ldr q10, [%[b_ptr0], #-0x60]\n"
                        "fmla v19.4s, v11.4s, v0.s[2]\n"
                        "fmla v23.4s, v11.4s, v1.s[2]\n"
                        "fmla v27.4s, v11.4s, v2.s[2]\n"
                        "fmla v31.4s, v11.4s, v3.s[2]\n"
                        "ldr q11, [%[b_ptr0], #-0x50]\n"
                        "fmla v16.4s, v8.4s, v0.s[3]\n"
                        "add %[b_ptr0], %[b_ptr0], #-0x40\n"
                        "fmla v20.4s, v8.4s, v1.s[3]\n"
                        "fmla v24.4s, v8.4s, v2.s[3]\n"
                        "fmla v28.4s, v8.4s, v3.s[3]\n"
                        "fmla v17.4s, v9.4s, v0.s[3]\n"
                        "fmla v21.4s, v9.4s, v1.s[3]\n"
                        "fmla v25.4s, v9.4s, v2.s[3]\n"
                        "fmla v29.4s, v9.4s, v3.s[3]\n"
                        "fmla v18.4s, v10.4s, v0.s[3]\n"
                        "fmla v22.4s, v10.4s, v1.s[3]\n"
                        "fmla v26.4s, v10.4s, v2.s[3]\n"
                        "fmla v30.4s, v10.4s, v3.s[3]\n"
                        "fmla v19.4s, v11.4s, v0.s[3]\n"
                        "fmla v23.4s, v11.4s, v1.s[3]\n"
                        "fmla v27.4s, v11.4s, v2.s[3]\n"
                        "fmla v31.4s, v11.4s, v3.s[3]\n"
                        "5:\n"
                        "cbz %[blocks], 6f\n"
                        "7:\n"
                        "ldr q8, [%[b_ptr0]]\n"
                        "subs %[blocks], %[blocks], #0x1\n"
                        "ldr q9, [%[b_ptr0], #0x10]\n"
                        "ldr s0, [%[a_ptr0]]\n"
                        "ldr q10, [%[b_ptr0], #0x20]\n"
                        "add %[a_ptr0], %[a_ptr0], #0x4\n"
                        "ldr q11, [%[b_ptr0], #0x30]\n"
                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
                        "fmla v16.4s, v8.4s, v0.s[0]\n"
                        "ldr s1, [a_ptr1]\n"
                        "fmla v17.4s, v9.4s, v0.s[0]\n"
                        "add a_ptr1, a_ptr1, #0x4\n"
                        "fmla v18.4s, v10.4s, v0.s[0]\n"
                        "ldr s2, [a_ptr2]\n"
                        "fmla v20.4s, v8.4s, v1.s[0]\n"
                        "add a_ptr2, a_ptr2, #0x4\n"
                        "fmla v21.4s, v9.4s, v1.s[0]\n"
                        "ldr s3, [a_ptr3]\n"
                        "fmla v24.4s, v8.4s, v2.s[0]\n"
                        "add a_ptr3, a_ptr3, #0x4\n"
                        "fmla v25.4s, v9.4s, v2.s[0]\n"
                        "fmla v28.4s, v8.4s, v3.s[0]\n"
                        "fmla v29.4s, v9.4s, v3.s[0]\n"
                        "fmla v22.4s, v10.4s, v1.s[0]\n"
                        "fmla v26.4s, v10.4s, v2.s[0]\n"
                        "fmla v30.4s, v10.4s, v3.s[0]\n"
                        "fmla v19.4s, v11.4s, v0.s[0]\n"
                        "fmla v23.4s, v11.4s, v1.s[0]\n"
                        "fmla v27.4s, v11.4s, v2.s[0]\n"
                        "fmla v31.4s, v11.4s, v3.s[0]\n"
                        "b.ne 7b\n"
                        "6:\n"
                        "ld1r {v14.4s}, [%[minptr]]\n"
                        "ld1r {v15.4s}, [%[maxptr]]\n"
                        "fmax v16.4s, v16.4s, v14.4s\n"
                        "fmax v17.4s, v17.4s, v14.4s\n"
                        "fmax v18.4s, v18.4s, v14.4s\n"
                        "fmax v19.4s, v19.4s, v14.4s\n"
                        "fmin v16.4s, v16.4s, v15.4s\n"
                        "fmin v17.4s, v17.4s, v15.4s\n"
                        "fmin v18.4s, v18.4s, v15.4s\n"
                        "fmin v19.4s, v19.4s, v15.4s\n"
                        "str q16, [%[c_ptr0]]\n"
                        "fmax v20.4s, v20.4s, v14.4s\n"
                        "fmax v21.4s, v21.4s, v14.4s\n"
                        "fmax v22.4s, v22.4s, v14.4s\n"
                        "str q17, [%[c_ptr0], #0x10]\n"
                        "fmax v23.4s, v23.4s, v14.4s\n"
                        "fmin v20.4s, v20.4s, v15.4s\n"
                        "fmin v21.4s, v21.4s, v15.4s\n"
                        "str q18, [%[c_ptr0], #0x20]\n"
                        "fmin v22.4s, v22.4s, v15.4s\n"
                        "fmin v23.4s, v23.4s, v15.4s\n"
                        "fmax v24.4s, v24.4s, v14.4s\n"
                        "str q19, [%[c_ptr0], #0x30]\n"
                        "fmax v25.4s, v25.4s, v14.4s\n"
                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
                        "fmax v26.4s, v26.4s, v14.4s\n"
                        "str q20, [c_ptr1]\n"
                        "fmin v24.4s, v24.4s, v15.4s\n"
                        "fmin v25.4s, v25.4s, v15.4s\n"
                        "fmax v27.4s, v27.4s, v14.4s\n"
                        "str q21, [c_ptr1, #0x10]\n"
                        "fmin v26.4s, v26.4s, v15.4s\n"
                        "fmax v28.4s, v28.4s, v14.4s\n"
                        "fmax v29.4s, v29.4s, v14.4s\n"
                        "str q22, [c_ptr1, #0x20]\n"
                        "fmin v27.4s, v27.4s, v15.4s\n"
                        "fmax v30.4s, v30.4s, v14.4s\n"
                        "fmin v28.4s, v28.4s, v15.4s\n"
                        "str q23, [c_ptr1, #0x30]\n"
                        "fmin v29.4s, v29.4s, v15.4s\n"
                        "fmax v31.4s, v31.4s, v14.4s\n"
                        "fmin v30.4s, v30.4s, v15.4s\n"
                        "str q24, [c_ptr2]\n"
                        "fmin v31.4s, v31.4s, v15.4s\n"
                        "str q25, [c_ptr2, #0x10]\n"
                        "str q26, [c_ptr2, #0x20]\n"
                        "str q27, [c_ptr2, #0x30]\n"
                        "str q28, [c_ptr3]\n"
                        "str q29, [c_ptr3, #0x10]\n"
                        "str q30, [c_ptr3, #0x20]\n"
                        "str q31, [c_ptr3, #0x30]\n"
                        ".unreq a_ptr1\n"
                        ".unreq a_ptr2\n"
                        ".unreq a_ptr3\n"
                        ".unreq c_ptr1\n"
                        ".unreq c_ptr2\n"
                        ".unreq c_ptr3\n"
                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
                    );
                    break;
            }
            if (use_result_buffer) {
                for(int cy=0; cy<std::min(M-y, 4); cy++) {
                    for(unsigned int cx=0; cx<width; cx++) {
                        c_ptr_real[cy * ldc + cx] = result_buffer[cy * 16 + cx];
                    }
                }
            }
        }
    }
}

} // namespace arm_gemm

#endif // __aarch64__
